From 54a75ecb3893dbefe6b5420b137aedddf922b504 Mon Sep 17 00:00:00 2001 From: xiebaiyuan Date: Wed, 23 Sep 2020 19:40:56 +0800 Subject: [PATCH] remove paddle mobile old project , never say good bye (#4421) * remove paddle mobile old project. never say good bye * test=develop --- CMakeLists.txt | 6 - mobile/.clang-format | 5 - mobile/.clang-tidy | 67 - mobile/.gitignore | 104 - mobile/.pre-commit-config.yaml | 69 - mobile/.travis.yml | 36 - mobile/.travis/pre-commit-job.sh | 21 - mobile/CMakeLists.txt | 293 - mobile/CONTRIBUTING.md | 234 - mobile/Dockerfile | 38 - mobile/LICENSE | 204 - mobile/README.md | 137 - mobile/benchmark/arm_benchmark.md | 36 - mobile/benchmark/metal_benchmark.md | 10 - mobile/demo/ReadMe.md | 10 - mobile/demo/getDemo.sh | 8 - mobile/doc/build.md | 63 - mobile/doc/design_doc.md | 171 - mobile/doc/development_android.md | 189 - mobile/doc/development_android_GPU.md | 77 - mobile/doc/development_arm_linux.md | 62 - mobile/doc/development_fpga.md | 5 - mobile/doc/development_ios.md | 85 - mobile/doc/quantification.md | 33 - mobile/src/common/common.h | 31 - mobile/src/common/enforce.h | 73 - mobile/src/common/log.h | 283 - mobile/src/common/threadpool.h | 126 - mobile/src/common/type_define.h | 187 - mobile/src/common/types.cpp | 266 - mobile/src/common/types.h | 277 - mobile/src/common/util.cpp | 46 - mobile/src/common/util.h | 26 - mobile/src/common/variant.h | 106 - mobile/src/fpga/KD/alignment.h | 32 - mobile/src/fpga/KD/context.hpp | 55 - mobile/src/fpga/KD/dl_engine.cpp | 15 - mobile/src/fpga/KD/dl_engine.hpp | 33 - mobile/src/fpga/KD/float16.hpp | 506 -- mobile/src/fpga/KD/layout.hpp | 99 - mobile/src/fpga/KD/llapi/bias_scale.cpp | 100 - mobile/src/fpga/KD/llapi/bias_scale.h | 29 - mobile/src/fpga/KD/llapi/config.h | 19 - mobile/src/fpga/KD/llapi/filter.cpp | 346 - mobile/src/fpga/KD/llapi/filter.h | 54 - mobile/src/fpga/KD/llapi/image.cpp | 149 - mobile/src/fpga/KD/llapi/image.h | 38 - mobile/src/fpga/KD/llapi/zynqmp_api.cpp | 384 -- 
mobile/src/fpga/KD/llapi/zynqmp_api.h | 329 - mobile/src/fpga/KD/pe.hpp | 45 - mobile/src/fpga/KD/pe_params.hpp | 179 - mobile/src/fpga/KD/pes/concat_pe.hpp | 70 - mobile/src/fpga/KD/pes/conv_pe.hpp | 96 - mobile/src/fpga/KD/pes/conv_process.hpp | 374 -- mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp | 98 - mobile/src/fpga/KD/pes/elementwise_add_pe.hpp | 74 - mobile/src/fpga/KD/pes/fully_connected_pe.hpp | 98 - mobile/src/fpga/KD/pes/input_pe.hpp | 53 - mobile/src/fpga/KD/pes/math_func_neon.h | 330 - mobile/src/fpga/KD/pes/output_pe.hpp | 52 - mobile/src/fpga/KD/pes/pooling_pe.hpp | 72 - mobile/src/fpga/KD/pes/softmax_pe.cpp | 162 - mobile/src/fpga/KD/pes/softmax_pe.hpp | 44 - mobile/src/fpga/KD/shape.hpp | 112 - mobile/src/fpga/KD/tensor.hpp | 281 - mobile/src/fpga/KD/tensor_util.cpp | 31 - mobile/src/fpga/KD/tensor_util.hpp | 25 - mobile/src/fpga/V1/api.cpp | 1021 --- mobile/src/fpga/V1/api.h | 102 - mobile/src/fpga/V1/bias_scale.cpp | 102 - mobile/src/fpga/V1/bias_scale.h | 29 - mobile/src/fpga/V1/deconv_bias_scale.cpp | 48 - mobile/src/fpga/V1/deconv_bias_scale.h | 26 - mobile/src/fpga/V1/deconv_filter.cpp | 280 - mobile/src/fpga/V1/deconv_filter.h | 39 - mobile/src/fpga/V1/filter.cpp | 362 -- mobile/src/fpga/V1/filter.h | 50 - mobile/src/fpga/V1/image.cpp | 138 - mobile/src/fpga/V1/image.h | 76 - mobile/src/fpga/V1/pe.cpp | 1180 ---- mobile/src/fpga/V2/api.cpp | 1011 --- mobile/src/fpga/V2/api.h | 94 - mobile/src/fpga/V2/bias_scale.cpp | 116 - mobile/src/fpga/V2/bias_scale.h | 29 - mobile/src/fpga/V2/deconv_bias_scale.cpp | 48 - mobile/src/fpga/V2/deconv_bias_scale.h | 26 - mobile/src/fpga/V2/deconv_filter.cpp | 280 - mobile/src/fpga/V2/deconv_filter.h | 39 - mobile/src/fpga/V2/filter.cpp | 362 -- mobile/src/fpga/V2/filter.h | 50 - mobile/src/fpga/V2/image.cpp | 144 - mobile/src/fpga/V2/image.h | 71 - mobile/src/fpga/V2/pe.cpp | 1138 ---- mobile/src/fpga/common/config.h | 18 - mobile/src/fpga/common/driver.cpp | 296 - mobile/src/fpga/common/driver.h | 141 - 
mobile/src/fpga/common/fpga_common.cpp | 214 - mobile/src/fpga/common/fpga_common.h | 331 - mobile/src/fpga/common/pe.h | 35 - mobile/src/framework/CMakeLists.txt | 0 mobile/src/framework/attribute.cpp | 40 - mobile/src/framework/attribute.h | 183 - mobile/src/framework/cl/cl_deleter.h | 65 - mobile/src/framework/cl/cl_engine.cpp | 91 - mobile/src/framework/cl/cl_engine.h | 256 - mobile/src/framework/cl/cl_half.cpp | 518 -- mobile/src/framework/cl/cl_half.h | 32 - mobile/src/framework/cl/cl_helper.h | 94 - mobile/src/framework/cl/cl_image.cpp | 187 - mobile/src/framework/cl/cl_image.h | 338 - .../src/framework/cl/cl_image_converter.cpp | 510 -- mobile/src/framework/cl/cl_image_converter.h | 121 - mobile/src/framework/cl/cl_scope.h | 129 - mobile/src/framework/cl/cl_tensor.h | 193 - mobile/src/framework/cl/cl_tool.cpp | 84 - mobile/src/framework/cl/cl_tool.h | 35 - mobile/src/framework/context.cpp | 605 -- mobile/src/framework/context.h | 79 - mobile/src/framework/data_layout.h | 63 - mobile/src/framework/data_type.cpp | 106 - mobile/src/framework/data_type.h | 80 - mobile/src/framework/ddim.cpp | 327 - mobile/src/framework/ddim.h | 192 - mobile/src/framework/dim.h | 335 - mobile/src/framework/executor.cpp | 1125 ---- mobile/src/framework/executor.h | 126 - mobile/src/framework/framework.pb-c.cpp | 1465 ----- mobile/src/framework/framework.pb-c.h | 615 -- mobile/src/framework/framework.proto | 196 - mobile/src/framework/load_ops.h | 388 -- mobile/src/framework/loader.cpp | 310 - mobile/src/framework/loader.h | 66 - mobile/src/framework/lod_tensor.cpp | 192 - mobile/src/framework/lod_tensor.h | 234 - mobile/src/framework/mixed_vector.h | 271 - mobile/src/framework/op_info.h | 96 - mobile/src/framework/op_kernel_type.h | 60 - mobile/src/framework/op_proto_maker.h | 22 - mobile/src/framework/op_registry.h | 125 - mobile/src/framework/operator.cpp | 172 - mobile/src/framework/operator.h | 211 - mobile/src/framework/program/block_desc.cpp | 44 - 
mobile/src/framework/program/block_desc.h | 86 - mobile/src/framework/program/op_desc.cpp | 100 - mobile/src/framework/program/op_desc.h | 78 - .../program-optimize/fusion_op_register.h | 82 - .../program/program-optimize/node.cpp | 281 - .../framework/program/program-optimize/node.h | 81 - .../program-optimize/program_optimize.cpp | 300 - .../program-optimize/program_optimize.h | 45 - mobile/src/framework/program/program.h | 41 - mobile/src/framework/program/program_desc.cpp | 118 - mobile/src/framework/program/program_desc.h | 62 - mobile/src/framework/program/tensor_desc.h | 75 - mobile/src/framework/program/var_desc.h | 80 - mobile/src/framework/scope.cpp | 155 - mobile/src/framework/scope.h | 113 - mobile/src/framework/selected_rows.cpp | 127 - mobile/src/framework/selected_rows.h | 138 - mobile/src/framework/tensor.h | 355 -- mobile/src/framework/tensor_base.h | 148 - mobile/src/framework/tensor_util.cpp | 30 - mobile/src/framework/tensor_util.h | 39 - mobile/src/framework/type_trait.h | 44 - mobile/src/framework/variable.h | 96 - mobile/src/framework/zynqmp/ztensor.hpp | 312 - mobile/src/io/api.cc | 86 - mobile/src/io/api_paddle_mobile.cc | 326 - mobile/src/io/api_paddle_mobile.h | 57 - mobile/src/io/ios_io/PaddleMobileCPU.h | 184 - mobile/src/io/ios_io/PaddleMobileCPU.mm | 410 -- mobile/src/io/jni/PML.java | 66 - mobile/src/io/jni/paddle_mobile_jni.cpp | 465 -- mobile/src/io/jni/paddle_mobile_jni.h | 91 - mobile/src/io/loader.h | 49 - mobile/src/io/opencl_interface.cpp | 53 - mobile/src/io/opencl_interface.h | 28 - mobile/src/io/paddle_inference_api.h | 238 - mobile/src/io/paddle_mobile.cpp | 550 -- mobile/src/io/paddle_mobile.h | 134 - mobile/src/io/paddle_mobile_wrap.cpp | 361 -- mobile/src/io/paddle_mobile_wrap.h | 97 - mobile/src/io/paddle_test_inference_api.cpp | 36 - mobile/src/io/paddle_test_inference_api.h | 35 - mobile/src/memory/t_malloc.cpp | 92 - mobile/src/memory/t_malloc.h | 63 - mobile/src/operators/activation_op.cpp | 105 - 
mobile/src/operators/activation_op.h | 47 - mobile/src/operators/assign_op.cpp | 39 - mobile/src/operators/assign_op.h | 33 - mobile/src/operators/assign_value_op.cpp | 41 - mobile/src/operators/assign_value_op.h | 33 - mobile/src/operators/batchnorm_op.cpp | 44 - mobile/src/operators/batchnorm_op.h | 48 - .../src/operators/beam_search_decode_op.cpp | 34 - mobile/src/operators/beam_search_decode_op.h | 32 - mobile/src/operators/beam_search_op.cpp | 34 - mobile/src/operators/beam_search_op.h | 31 - mobile/src/operators/bilinear_interp_op.cpp | 76 - mobile/src/operators/bilinear_interp_op.h | 48 - mobile/src/operators/box_coder_op.cpp | 64 - mobile/src/operators/box_coder_op.h | 49 - mobile/src/operators/cast_op.cpp | 36 - mobile/src/operators/cast_op.h | 45 - mobile/src/operators/compare_op.cpp | 45 - mobile/src/operators/compare_op.h | 34 - mobile/src/operators/concat_op.cpp | 77 - mobile/src/operators/concat_op.h | 45 - mobile/src/operators/conditional_block_op.cpp | 34 - mobile/src/operators/conditional_block_op.h | 34 - .../tensor_array_read_write_op.cpp | 43 - .../controlflow/tensor_array_read_write_op.h | 34 - mobile/src/operators/controlflow/while_op.cpp | 36 - mobile/src/operators/controlflow/while_op.h | 30 - mobile/src/operators/conv_op.cpp | 67 - mobile/src/operators/conv_op.h | 45 - mobile/src/operators/conv_transpose_op.cpp | 36 - mobile/src/operators/conv_transpose_op.h | 97 - mobile/src/operators/crf_op.cpp | 55 - mobile/src/operators/crf_op.h | 46 - mobile/src/operators/depthwise_conv_op.cpp | 62 - mobile/src/operators/depthwise_conv_op.h | 43 - mobile/src/operators/dequantize_op.cpp | 36 - mobile/src/operators/dequantize_op.h | 46 - mobile/src/operators/detection_ops.cpp | 145 - mobile/src/operators/detection_ops.h | 46 - mobile/src/operators/dropout_op.cpp | 40 - mobile/src/operators/dropout_op.h | 49 - mobile/src/operators/elementwise_add_op.cpp | 44 - mobile/src/operators/elementwise_add_op.h | 47 - mobile/src/operators/elementwise_mul_op.cpp | 
42 - mobile/src/operators/elementwise_mul_op.h | 51 - mobile/src/operators/elementwise_sub_op.cpp | 41 - mobile/src/operators/elementwise_sub_op.h | 51 - mobile/src/operators/exp_op.cpp | 36 - mobile/src/operators/exp_op.h | 30 - mobile/src/operators/expand_op.cpp | 49 - mobile/src/operators/expand_op.h | 35 - mobile/src/operators/feed_op.cpp | 47 - mobile/src/operators/feed_op.h | 45 - mobile/src/operators/fetch_op.cpp | 39 - mobile/src/operators/fetch_op.h | 44 - .../fill_constant_batch_size_like_op.cpp | 25 - .../fill_constant_batch_size_like_op.h | 96 - mobile/src/operators/fill_constant_op.cpp | 27 - mobile/src/operators/fill_constant_op.h | 79 - mobile/src/operators/flatten2_op.cpp | 48 - mobile/src/operators/flatten2_op.h | 34 - mobile/src/operators/flatten_op.cpp | 52 - mobile/src/operators/flatten_op.h | 71 - .../src/operators/fusion_conv_add_bn_op.cpp | 61 - mobile/src/operators/fusion_conv_add_bn_op.h | 76 - .../operators/fusion_conv_add_bn_relu_op.cpp | 64 - .../operators/fusion_conv_add_bn_relu_op.h | 77 - mobile/src/operators/fusion_conv_add_op.cpp | 64 - mobile/src/operators/fusion_conv_add_op.h | 66 - .../src/operators/fusion_conv_add_relu_op.cpp | 62 - .../src/operators/fusion_conv_add_relu_op.h | 68 - .../operators/fusion_conv_bn_add_relu_op.cpp | 65 - .../operators/fusion_conv_bn_add_relu_op.h | 83 - mobile/src/operators/fusion_conv_bn_op.cpp | 61 - mobile/src/operators/fusion_conv_bn_op.h | 72 - .../src/operators/fusion_conv_bn_relu_op.cpp | 64 - mobile/src/operators/fusion_conv_bn_relu_op.h | 74 - mobile/src/operators/fusion_conv_relu_op.cpp | 64 - mobile/src/operators/fusion_conv_relu_op.h | 66 - .../src/operators/fusion_deconv_add_bn_op.cpp | 32 - .../src/operators/fusion_deconv_add_bn_op.h | 116 - .../fusion_deconv_add_bn_relu_op.cpp | 33 - .../operators/fusion_deconv_add_bn_relu_op.h | 118 - mobile/src/operators/fusion_deconv_add_op.cpp | 32 - mobile/src/operators/fusion_deconv_add_op.h | 108 - .../operators/fusion_deconv_add_relu_op.cpp | 
33 - .../src/operators/fusion_deconv_add_relu_op.h | 110 - .../operators/fusion_deconv_bn_relu_op.cpp | 32 - .../src/operators/fusion_deconv_bn_relu_op.h | 115 - .../src/operators/fusion_deconv_relu_op.cpp | 31 - mobile/src/operators/fusion_deconv_relu_op.h | 107 - .../operators/fusion_dequant_add_bn_op.cpp | 38 - .../src/operators/fusion_dequant_add_bn_op.h | 75 - .../fusion_dequant_add_bn_relu_op.cpp | 40 - .../operators/fusion_dequant_add_bn_relu_op.h | 77 - .../fusion_dequant_add_bn_relu_quant_op.cpp | 62 - .../fusion_dequant_add_bn_relu_quant_op.h | 123 - mobile/src/operators/fusion_dequant_bn_op.cpp | 54 - mobile/src/operators/fusion_dequant_bn_op.h | 101 - .../src/operators/fusion_dequant_bn_relu_op.h | 74 - .../operators/fusion_dwconv_bn_relu_op.cpp | 63 - .../src/operators/fusion_dwconv_bn_relu_op.h | 76 - .../fusion_elementwise_add_relu_op.cpp | 44 - .../fusion_elementwise_add_relu_op.h | 68 - mobile/src/operators/fusion_fc_op.cpp | 70 - mobile/src/operators/fusion_fc_op.h | 64 - mobile/src/operators/fusion_fc_relu_op.cpp | 67 - mobile/src/operators/fusion_fc_relu_op.h | 66 - .../operators/fusion_instancenorm_relu_op.cpp | 39 - .../operators/fusion_instancenorm_relu_op.h | 68 - mobile/src/operators/grid_sampler_op.cpp | 36 - mobile/src/operators/grid_sampler_op.h | 35 - mobile/src/operators/gru_op.cpp | 66 - mobile/src/operators/gru_op.h | 46 - mobile/src/operators/gru_unit_op.cpp | 69 - mobile/src/operators/gru_unit_op.h | 44 - mobile/src/operators/im2sequence_op.cpp | 55 - mobile/src/operators/im2sequence_op.h | 48 - mobile/src/operators/increment_op.cpp | 49 - mobile/src/operators/increment_op.h | 48 - mobile/src/operators/instancenorm_op.cpp | 39 - mobile/src/operators/instancenorm_op.h | 48 - mobile/src/operators/is_empty_op.cpp | 44 - mobile/src/operators/is_empty_op.h | 47 - .../src/operators/kernel/activation_kernel.h | 44 - .../kernel/arm/activation_kernel.cpp | 116 - .../kernel/arm/anchor_generator_kernel.cpp | 37 - 
.../operators/kernel/arm/assign_kernel.cpp | 39 - .../kernel/arm/assign_value_kernel.cpp | 87 - .../operators/kernel/arm/batchnorm_kernel.cpp | 36 - .../kernel/arm/beam_search_decode_kernel.cpp | 278 - .../kernel/arm/beam_search_kernel.cpp | 262 - .../kernel/arm/bilinear_interp_kernel.cpp | 37 - .../operators/kernel/arm/box_coder_kernel.cpp | 36 - .../src/operators/kernel/arm/cast_kernel.cpp | 84 - .../operators/kernel/arm/compare_kernel.cpp | 274 - .../operators/kernel/arm/concat_kernel.cpp | 41 - .../kernel/arm/conditional_block_kernel.cpp | 100 - .../convolution/conv_add_bn_relu_kernel.cpp | 178 - .../arm/convolution/conv_add_kernel.cpp | 79 - .../arm/convolution/conv_add_relu_kernel.cpp | 77 - .../convolution/conv_bn_add_relu_kernel.cpp | 96 - .../arm/convolution/conv_bn_relu_kernel.cpp | 146 - .../kernel/arm/convolution/conv_common.cpp | 116 - .../kernel/arm/convolution/conv_common.h | 25 - .../kernel/arm/convolution/conv_kernel.cpp | 75 - .../arm/convolution/conv_relu_kernel.cpp | 66 - .../arm/convolution/conv_transpose_kernel.cpp | 39 - .../arm/convolution/dwconv_bn_relu_kernel.cpp | 95 - .../src/operators/kernel/arm/crf_kernel.cpp | 39 - .../kernel/arm/density_prior_box_kernel.cpp | 37 - .../kernel/arm/dequantize_bn_kernel.cpp | 340 - .../kernel/arm/dequantize_kernel.cpp | 81 - .../operators/kernel/arm/dropout_kernel.cpp | 51 - .../kernel/arm/elementwise_add_kernel.cpp | 43 - .../kernel/arm/elementwise_mul_kernel.cpp | 38 - .../kernel/arm/elementwise_sub_kernel.cpp | 38 - .../src/operators/kernel/arm/exp_kernel.cpp | 47 - .../src/operators/kernel/arm/feed_kernel.cpp | 35 - .../src/operators/kernel/arm/fetch_kernel.cpp | 31 - .../operators/kernel/arm/flatten_kernel.cpp | 36 - .../operators/kernel/arm/fusion_fc_kernel.cpp | 75 - .../src/operators/kernel/arm/gru_kernel.cpp | 39 - .../operators/kernel/arm/gru_unit_kernel.cpp | 38 - .../kernel/arm/im2sequence_kernel.cpp | 87 - .../operators/kernel/arm/increment_kernel.cpp | 36 - 
.../operators/kernel/arm/is_empty_kernel.cpp | 37 - .../operators/kernel/arm/lod_reset_kernel.cpp | 68 - .../operators/kernel/arm/logical_kernel.cpp | 125 - .../operators/kernel/arm/lookup_kernel.cpp | 36 - .../src/operators/kernel/arm/lrn_kernel.cpp | 36 - .../src/operators/kernel/arm/mul_kernel.cpp | 39 - .../kernel/arm/multiclass_nms_kernel.cpp | 37 - .../kernel/arm/nearest_interp_kernel.cpp | 88 - .../src/operators/kernel/arm/norm_kernel.cpp | 36 - .../operators/kernel/arm/one_hot_kernel.cpp | 61 - .../src/operators/kernel/arm/pad2d_kernel.cpp | 45 - .../arm/polygon_box_transform_kernel.cpp | 38 - .../src/operators/kernel/arm/pool_kernel.cpp | 36 - .../src/operators/kernel/arm/prelu_kernel.cpp | 122 - .../operators/kernel/arm/prior_box_kernel.cpp | 36 - .../operators/kernel/arm/proposal_kernel.cpp | 36 - .../kernel/arm/psroi_pool_kernel.cpp | 36 - .../operators/kernel/arm/quantize_kernel.cpp | 221 - .../operators/kernel/arm/reshape2_kernel.cpp | 36 - .../operators/kernel/arm/reshape_kernel.cpp | 36 - .../operators/kernel/arm/resize_kernel.cpp | 124 - .../kernel/arm/roi_perspective_kernel.cpp | 291 - .../src/operators/kernel/arm/scale_kernel.cpp | 88 - .../kernel/arm/sequence_expand_kernel.cpp | 115 - .../kernel/arm/sequence_pool_kernel.cpp | 215 - .../kernel/arm/sequence_softmax_kernel.cpp | 44 - .../src/operators/kernel/arm/shape_kernel.cpp | 36 - .../src/operators/kernel/arm/slice_kernel.cpp | 86 - .../operators/kernel/arm/softmax_kernel.cpp | 38 - .../src/operators/kernel/arm/split_kernel.cpp | 36 - .../src/operators/kernel/arm/sum_kernel.cpp | 37 - .../arm/tensor_array_read_write_kernel.cpp | 66 - .../src/operators/kernel/arm/top_k_kernel.cpp | 68 - .../kernel/arm/transpose2_kernel.cpp | 146 - .../operators/kernel/arm/transpose_kernel.cpp | 35 - .../src/operators/kernel/arm/while_kernel.cpp | 128 - mobile/src/operators/kernel/assign_kernel.h | 53 - .../operators/kernel/assign_value_kernel.h | 53 - .../src/operators/kernel/batchnorm_kernel.h | 36 - 
.../kernel/beam_search_decode_kernel.h | 58 - .../src/operators/kernel/beam_search_kernel.h | 74 - .../operators/kernel/bilinear_interp_kernel.h | 38 - .../src/operators/kernel/box_coder_kernel.h | 38 - .../central-arm-func/activation_arm_func.h | 107 - .../central-arm-func/batchnorm_arm_func.h | 83 - .../bilinear_interp_arm_func.h | 91 - .../central-arm-func/box_coder_arm_func.h | 142 - .../kernel/central-arm-func/concat_arm_func.h | 90 - .../central-arm-func/conv_add_arm_func.h | 151 - .../conv_add_bn_relu_arm_func.h | 143 - .../central-arm-func/conv_add_relu_arm_func.h | 154 - .../kernel/central-arm-func/conv_arm_func.cpp | 379 -- .../kernel/central-arm-func/conv_arm_func.h | 58 - .../conv_bn_add_relu_arm_func.h | 148 - .../central-arm-func/conv_bn_relu_arm_func.h | 146 - .../conv_transpose_arm_func.h | 111 - .../kernel/central-arm-func/crf_arm_func.h | 118 - .../density_prior_box_arm_func.h | 161 - .../dwconv_bn_relu_arm_func.h | 144 - .../elementwise_add_arm_func.h | 78 - .../elementwise_mul_arm_func.h | 45 - .../elementwise_sub_arm_func.h | 65 - .../central-arm-func/flatten_arm_func.h | 50 - .../central-arm-func/fusion_fc_arm_func.h | 75 - .../kernel/central-arm-func/gru_arm_func.h | 107 - .../central-arm-func/gru_unit_arm_func.h | 72 - .../central-arm-func/increment_arm_func.h | 39 - .../kernel/central-arm-func/lookup_arm_func.h | 58 - .../kernel/central-arm-func/lrn_arm_func.h | 47 - .../kernel/central-arm-func/mul_arm_func.h | 59 - .../multiclass_nms_arm_func.h | 307 - .../kernel/central-arm-func/norm_arm_func.h | 106 - .../polygon_box_transform_arm_func.h | 53 - .../kernel/central-arm-func/pool_arm_func.h | 91 - .../central-arm-func/prior_box_arm_func.h | 199 - .../central-arm-func/reshape2_arm_func.h | 59 - .../central-arm-func/reshape_arm_func.h | 56 - .../kernel/central-arm-func/shape_arm_func.h | 38 - .../central-arm-func/softmax_arm_func.h | 92 - .../kernel/central-arm-func/split_arm_func.h | 86 - .../kernel/central-arm-func/sum_arm_func.h | 153 - 
.../central-arm-func/transpose_arm_func.h | 70 - .../operators/kernel/cl/batchnorm_kernel.cpp | 111 - .../kernel/cl/bilinear_interp_kernel.cpp | 85 - .../operators/kernel/cl/box_coder_kernel.cpp | 78 - .../kernel/cl/cl-kernel-func/conv_func.cpp | 1140 ---- .../kernel/cl/cl-kernel-func/conv_func.h | 89 - .../cl/cl-kernel-func/instancenorm_func.cpp | 78 - .../cl/cl-kernel-func/instancenorm_func.h | 28 - .../kernel/cl/cl_kernel/batchnorm_kernel.cl | 37 - .../cl/cl_kernel/bilinear_interp_kernel.cl | 87 - .../kernel/cl/cl_kernel/box_coder_kernel.cl | 147 - .../kernel/cl/cl_kernel/channel_add_kernel.cl | 51 - .../operators/kernel/cl/cl_kernel/cl_common.h | 34 - .../kernel/cl/cl_kernel/concat_kernel.cl | 291 - .../kernel/cl/cl_kernel/conv_kernel.cl | 15 - .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 2836 --------- .../cl/cl_kernel/conv_transpose_kernel.cl | 553 -- .../cl/cl_kernel/density_prior_box_kernel.cl | 114 - .../depthwise_conv_add_bn_relu_kernel.cl | 18 - .../cl/cl_kernel/depthwise_conv_kernel.cl | 15 - .../kernel/cl/cl_kernel/dropout_kernel.cl | 42 - .../cl/cl_kernel/elementwise_add_kernel.cl | 27 - .../cl/cl_kernel/elementwise_mul_kernel.cl | 150 - .../cl/cl_kernel/elementwise_sub_kernel.cl | 27 - .../kernel/cl/cl_kernel/exp_kernel.cl | 34 - .../operators/kernel/cl/cl_kernel/expend.cl | 159 - .../kernel/cl/cl_kernel/feed_kernel.cl | 110 - .../kernel/cl/cl_kernel/fetch_kernel.cl | 104 - .../kernel/cl/cl_kernel/flatten2_kernel.cl | 48 - .../cl/cl_kernel/grid_sampler_kernel.cl | 99 - .../cl/cl_kernel/instancenorm_kernel.cl | 126 - .../kernel/cl/cl_kernel/leakyrelu_kernel.cl | 38 - .../kernel/cl/cl_kernel/lrn_kernel.cl | 136 - .../cl/cl_kernel/nearest_interp_kernel.cl | 37 - .../kernel/cl/cl_kernel/pad2d_kernel.cl | 57 - .../cl/cl_kernel/pixel_shuffle_kernel.cl | 114 - .../kernel/cl/cl_kernel/pool_kernel.cl | 95 - .../kernel/cl/cl_kernel/pre_post_kernel.cl | 22 - .../kernel/cl/cl_kernel/prior_box_kernel.cl | 129 - .../src/operators/kernel/cl/cl_kernel/relu.cl | 58 - 
.../operators/kernel/cl/cl_kernel/relu6.cl | 32 - .../operators/kernel/cl/cl_kernel/reshape.cl | 202 - .../kernel/cl/cl_kernel/scale_kernel.cl | 35 - .../operators/kernel/cl/cl_kernel/sigmoid.cl | 34 - .../kernel/cl/cl_kernel/slice_kernel.cl | 77 - .../operators/kernel/cl/cl_kernel/softmax.cl | 92 - .../kernel/cl/cl_kernel/tanh_kernel.cl | 31 - .../kernel/cl/cl_kernel/transpose_kernel.cl | 169 - .../src/operators/kernel/cl/concat_kernel.cpp | 196 - .../kernel/cl/conv_add_bn_relu_kernel.cpp | 271 - .../operators/kernel/cl/conv_add_kernel.cpp | 167 - .../kernel/cl/conv_add_relu_kernel.cpp | 181 - .../kernel/cl/conv_bn_add_relu_kernel.cpp | 184 - .../kernel/cl/conv_bn_relu_kernel.cpp | 208 - .../src/operators/kernel/cl/conv_kernel.cpp | 164 - .../operators/kernel/cl/conv_relu_kernel.cpp | 153 - .../kernel/cl/conv_transpose_kernel.cpp | 77 - .../kernel/cl/density_prior_box_kernel.cpp | 160 - .../kernel/cl/depthwise_conv_kernel.cpp | 96 - .../operators/kernel/cl/dropout_kernel.cpp | 59 - .../kernel/cl/dwconv_bn_relu_kernel.cpp | 176 - .../kernel/cl/elementwise_add_kernel.cpp | 129 - .../kernel/cl/elementwise_mul_kernel.cpp | 221 - .../kernel/cl/elementwise_sub_kernel.cpp | 75 - mobile/src/operators/kernel/cl/exp_kernel.cpp | 52 - .../src/operators/kernel/cl/expand_kernel.cpp | 130 - .../src/operators/kernel/cl/feed_kernel.cpp | 89 - .../src/operators/kernel/cl/fetch_kernel.cpp | 116 - .../operators/kernel/cl/flatten2_kernel.cpp | 79 - .../operators/kernel/cl/fusion_fc_kernel.cpp | 123 - mobile/src/operators/kernel/cl/gen_code.py | 208 - .../kernel/cl/grid_sampler_kernel.cpp | 66 - .../kernel/cl/instancenorm_kernel.cpp | 51 - .../kernel/cl/instancenorm_relu_kernel.cpp | 51 - .../operators/kernel/cl/leakyrelu_kernel.cpp | 59 - mobile/src/operators/kernel/cl/lrn_kernel.cpp | 79 - mobile/src/operators/kernel/cl/mul_kernel.cpp | 88 - .../kernel/cl/multiclass_nms_kernel.cpp | 340 - .../kernel/cl/nearest_interp_kernel.cpp | 73 - .../src/operators/kernel/cl/pad2d_kernel.cpp | 
94 - .../kernel/cl/pixel_shuffle_kernel.cpp | 80 - .../src/operators/kernel/cl/pool_kernel.cpp | 107 - .../operators/kernel/cl/prior_box_kernel.cpp | 216 - .../src/operators/kernel/cl/relu6_kernel.cpp | 53 - .../src/operators/kernel/cl/relu_kernel.cpp | 72 - .../operators/kernel/cl/reshape2_kernel.cpp | 150 - .../operators/kernel/cl/reshape_kernel.cpp | 106 - .../src/operators/kernel/cl/scale_kernel.cpp | 62 - .../operators/kernel/cl/sigmoid_kernel.cpp | 50 - .../src/operators/kernel/cl/slice_kernel.cpp | 64 - .../operators/kernel/cl/softmax_kernel.cpp | 65 - .../src/operators/kernel/cl/split_kernel.cpp | 116 - .../src/operators/kernel/cl/tanh_kernel.cpp | 51 - .../operators/kernel/cl/transpose2_kernel.cpp | 219 - .../operators/kernel/cl/transpose_kernel.cpp | 134 - mobile/src/operators/kernel/compare_kernel.h | 32 - mobile/src/operators/kernel/concat_kernel.h | 37 - .../kernel/conditional_block_kernel.h | 70 - .../src/operators/kernel/conv_add_bn_kernel.h | 44 - .../kernel/conv_add_bn_relu_kernel.h | 49 - mobile/src/operators/kernel/conv_add_kernel.h | 49 - .../operators/kernel/conv_add_relu_kernel.h | 44 - .../kernel/conv_bn_add_relu_kernel.h | 44 - mobile/src/operators/kernel/conv_bn_kernel.h | 44 - .../operators/kernel/conv_bn_relu_kernel.h | 48 - mobile/src/operators/kernel/conv_kernel.h | 41 - .../src/operators/kernel/conv_relu_kernel.h | 42 - .../operators/kernel/conv_transpose_kernel.h | 39 - mobile/src/operators/kernel/crf_kernel.h | 37 - .../operators/kernel/deconv_add_bn_kernel.h | 39 - .../kernel/deconv_add_bn_relu_kernel.h | 39 - .../src/operators/kernel/deconv_add_kernel.h | 39 - .../operators/kernel/deconv_add_relu_kernel.h | 39 - .../operators/kernel/deconv_bn_relu_kernel.h | 39 - .../src/operators/kernel/deconv_relu_kernel.h | 39 - .../src/operators/kernel/dequant_bn_kernel.h | 48 - .../src/operators/kernel/dequantize_kernel.h | 36 - .../src/operators/kernel/detection_kernel.h | 232 - mobile/src/operators/kernel/dropout_kernel.h | 35 - 
.../operators/kernel/dwconv_bn_relu_kernel.h | 44 - .../operators/kernel/elementwise_add_kernel.h | 39 - .../kernel/elementwise_add_relu_kernel.h | 38 - .../operators/kernel/elementwise_mul_kernel.h | 36 - .../operators/kernel/elementwise_sub_kernel.h | 38 - mobile/src/operators/kernel/exp_kernel.h | 24 - mobile/src/operators/kernel/expand_kernel.h | 28 - mobile/src/operators/kernel/fc_relu_kernel.h | 37 - mobile/src/operators/kernel/feed_kernel.h | 32 - mobile/src/operators/kernel/fetch_kernel.h | 34 - mobile/src/operators/kernel/flatten2_kernel.h | 28 - mobile/src/operators/kernel/flatten_kernel.h | 37 - .../kernel/fpga/KD/conv_add_bn_kernel.cpp | 47 - .../kernel/fpga/KD/conv_add_kernel.cpp | 34 - .../kernel/fpga/KD/conv_add_relu_kernel.cpp | 34 - .../kernel/fpga/KD/conv_bn_kernel.cpp | 69 - .../kernel/fpga/KD/conv_bn_relu_kernel.cpp | 76 - .../fpga/KD/elementwise_add_relu_kernel.cpp | 60 - .../operators/kernel/fpga/KD/feed_kernel.cpp | 65 - .../operators/kernel/fpga/KD/fetch_kernel.cpp | 55 - .../kernel/fpga/KD/fusion_fc_kernel.cpp | 56 - .../operators/kernel/fpga/KD/pool_kernel.cpp | 62 - .../kernel/fpga/KD/softmax_kernel.cpp | 55 - .../fpga/V1/anchor_generator_kernel.cpp | 88 - .../kernel/fpga/V1/concat_kernel.cpp | 69 - .../kernel/fpga/V1/conv_add_bn_kernel.cpp | 86 - .../fpga/V1/conv_add_bn_relu_kernel.cpp | 100 - .../kernel/fpga/V1/conv_add_kernel.cpp | 63 - .../kernel/fpga/V1/conv_add_relu_kernel.cpp | 63 - .../kernel/fpga/V1/conv_bn_kernel.cpp | 75 - .../kernel/fpga/V1/conv_bn_relu_kernel.cpp | 85 - .../operators/kernel/fpga/V1/conv_kernel.cpp | 56 - .../kernel/fpga/V1/conv_transpose_kernel.cpp | 89 - .../kernel/fpga/V1/deconv_add_bn_kernel.cpp | 90 - .../fpga/V1/deconv_add_bn_relu_kernel.cpp | 91 - .../kernel/fpga/V1/deconv_add_kernel.cpp | 90 - .../kernel/fpga/V1/deconv_add_relu_kernel.cpp | 91 - .../kernel/fpga/V1/deconv_bn_relu_kernel.cpp | 108 - .../kernel/fpga/V1/dropout_kernel.cpp | 34 - .../kernel/fpga/V1/elementwise_add_kernel.cpp | 191 - 
.../fpga/V1/elementwise_add_relu_kernel.cpp | 72 - .../kernel/fpga/V1/elementwise_mul_kernel.cpp | 93 - .../operators/kernel/fpga/V1/feed_kernel.cpp | 108 - .../operators/kernel/fpga/V1/fetch_kernel.cpp | 127 - .../kernel/fpga/V1/fusion_fc_kernel.cpp | 74 - .../kernel/fpga/V1/fusion_fc_relu_kernel.cpp | 75 - .../operators/kernel/fpga/V1/pad2d_kernel.cpp | 60 - .../operators/kernel/fpga/V1/pool_kernel.cpp | 104 - .../kernel/fpga/V1/proposal_kernel.cpp | 567 -- .../kernel/fpga/V1/psroi_pool_kernel.cpp | 284 - .../operators/kernel/fpga/V1/relu_kernel.cpp | 35 - .../kernel/fpga/V1/reshape2_kernel.cpp | 127 - .../kernel/fpga/V1/reshape_kernel.cpp | 40 - .../kernel/fpga/V1/roialign_pool_kernel.cpp | 296 - .../kernel/fpga/V1/sigmoid_kernel.cpp | 54 - .../operators/kernel/fpga/V1/slice_kernel.cpp | 63 - .../kernel/fpga/V1/softmax_kernel.cpp | 138 - .../operators/kernel/fpga/V1/split_kernel.cpp | 74 - .../operators/kernel/fpga/V1/tanh_kernel.cpp | 79 - .../kernel/fpga/V1/transpose2_kernel.cpp | 55 - .../fpga/V2/anchor_generator_kernel.cpp | 87 - .../kernel/fpga/V2/concat_kernel.cpp | 78 - .../kernel/fpga/V2/conv_add_bn_kernel.cpp | 89 - .../fpga/V2/conv_add_bn_relu_kernel.cpp | 104 - .../kernel/fpga/V2/conv_add_kernel.cpp | 64 - .../kernel/fpga/V2/conv_add_relu_kernel.cpp | 64 - .../kernel/fpga/V2/conv_bn_kernel.cpp | 76 - .../kernel/fpga/V2/conv_bn_relu_kernel.cpp | 93 - .../operators/kernel/fpga/V2/conv_kernel.cpp | 58 - .../kernel/fpga/V2/conv_transpose_kernel.cpp | 94 - .../kernel/fpga/V2/deconv_add_bn_kernel.cpp | 98 - .../fpga/V2/deconv_add_bn_relu_kernel.cpp | 98 - .../kernel/fpga/V2/deconv_add_kernel.cpp | 98 - .../kernel/fpga/V2/deconv_add_relu_kernel.cpp | 93 - .../kernel/fpga/V2/deconv_bn_relu_kernel.cpp | 114 - .../kernel/fpga/V2/dropout_kernel.cpp | 34 - .../kernel/fpga/V2/elementwise_add_kernel.cpp | 94 - .../fpga/V2/elementwise_add_relu_kernel.cpp | 96 - .../kernel/fpga/V2/elementwise_mul_kernel.cpp | 93 - .../operators/kernel/fpga/V2/feed_kernel.cpp | 64 - 
.../operators/kernel/fpga/V2/fetch_kernel.cpp | 118 - .../kernel/fpga/V2/fusion_fc_kernel.cpp | 75 - .../kernel/fpga/V2/fusion_fc_relu_kernel.cpp | 76 - .../operators/kernel/fpga/V2/pool_kernel.cpp | 106 - .../kernel/fpga/V2/proposal_kernel.cpp | 452 -- .../kernel/fpga/V2/psroi_pool_kernel.cpp | 188 - .../operators/kernel/fpga/V2/relu_kernel.cpp | 33 - .../kernel/fpga/V2/reshape2_kernel.cpp | 145 - .../kernel/fpga/V2/reshape_kernel.cpp | 40 - .../kernel/fpga/V2/roialign_pool_kernel.cpp | 296 - .../kernel/fpga/V2/sigmoid_kernel.cpp | 57 - .../operators/kernel/fpga/V2/slice_kernel.cpp | 87 - .../kernel/fpga/V2/softmax_kernel.cpp | 125 - .../operators/kernel/fpga/V2/split_kernel.cpp | 74 - .../operators/kernel/fpga/V2/tanh_kernel.cpp | 79 - .../kernel/fpga/V2/transpose2_kernel.cpp | 55 - .../src/operators/kernel/fusion_fc_kernel.h | 37 - .../operators/kernel/grid_sampler_kernel.h | 28 - mobile/src/operators/kernel/gru_kernel.h | 37 - mobile/src/operators/kernel/gru_unit_kernel.h | 35 - .../src/operators/kernel/im2sequence_kernel.h | 38 - .../src/operators/kernel/increment_kernel.h | 36 - .../operators/kernel/instancenorm_kernel.h | 37 - .../kernel/instancenorm_relu_kernel.h | 42 - mobile/src/operators/kernel/is_empty_kernel.h | 36 - mobile/src/operators/kernel/kernels.h | 36 - mobile/src/operators/kernel/logical_kernel.h | 42 - mobile/src/operators/kernel/lookup_kernel.h | 37 - mobile/src/operators/kernel/lrn_kernel.h | 181 - mobile/src/operators/kernel/mul_kernel.h | 38 - .../operators/kernel/multiclass_nms_kernel.h | 37 - .../operators/kernel/nearest_interp_kernel.h | 38 - mobile/src/operators/kernel/norm_kernel.h | 36 - mobile/src/operators/kernel/one_hot_kernel.h | 51 - mobile/src/operators/kernel/pad2d_kernel.h | 54 - .../operators/kernel/pixel_shuffle_kernel.h | 44 - .../kernel/polygon_box_transform_kernel.h | 36 - mobile/src/operators/kernel/pool_kernel.h | 35 - mobile/src/operators/kernel/prelu_kernel.h | 30 - .../src/operators/kernel/prior_box_kernel.h | 120 
- mobile/src/operators/kernel/quantize_kernel.h | 36 - mobile/src/operators/kernel/range_kernel.cpp | 49 - mobile/src/operators/kernel/range_kernel.h | 71 - .../operators/kernel/reduce_prod_kernel.cpp | 65 - .../src/operators/kernel/reduce_prod_kernel.h | 65 - mobile/src/operators/kernel/reshape2_kernel.h | 36 - mobile/src/operators/kernel/reshape_kernel.h | 80 - mobile/src/operators/kernel/resize_kernel.h | 82 - mobile/src/operators/kernel/scale_kernel.h | 35 - .../src/operators/kernel/sequence_kernels.h | 36 - mobile/src/operators/kernel/shape_kernel.h | 37 - mobile/src/operators/kernel/slice_kernel.h | 31 - mobile/src/operators/kernel/softmax_kernel.h | 36 - mobile/src/operators/kernel/split_kernel.h | 37 - mobile/src/operators/kernel/sum_kernel.h | 35 - mobile/src/operators/kernel/tanh_kernel.h | 37 - .../kernel/tensor_array_read_write_kernel.h | 32 - .../src/operators/kernel/transpose2_kernel.h | 37 - .../src/operators/kernel/transpose_kernel.h | 37 - mobile/src/operators/kernel/while_kernel.h | 47 - mobile/src/operators/lod_reset_op.cpp | 41 - mobile/src/operators/lod_reset_op.h | 32 - mobile/src/operators/logical_op.cpp | 69 - mobile/src/operators/logical_op.h | 42 - mobile/src/operators/lookup_op.cpp | 66 - mobile/src/operators/lookup_op.h | 46 - mobile/src/operators/lrn_op.cpp | 39 - mobile/src/operators/lrn_op.h | 46 - mobile/src/operators/math/activation.h | 187 - .../math/depthwise/faster_depthwise_conv3x3.h | 34 - .../depthwise/faster_depthwise_conv3x3p1.cpp | 2011 ------ .../src/operators/math/depthwise_conv3x3.cpp | 1062 --- mobile/src/operators/math/depthwise_conv3x3.h | 47 - .../operators/math/depthwise_conv3x3_int8.cpp | 1660 ----- .../src/operators/math/depthwise_conv5x5.cpp | 1106 ---- mobile/src/operators/math/depthwise_conv5x5.h | 47 - .../operators/math/depthwise_conv5x5_int8.cpp | 1041 --- mobile/src/operators/math/element_wise.h | 396 -- .../operators/math/elementwise_op_function.h | 178 - mobile/src/operators/math/gemm.cpp | 3807 
----------- mobile/src/operators/math/gemm.h | 492 -- mobile/src/operators/math/gemm/cblas.cc | 50 - mobile/src/operators/math/gemm/cblas.h | 32 - mobile/src/operators/math/gemm/executor.h | 266 - mobile/src/operators/math/gemm/gemm1x1s1.cpp | 2223 ------- mobile/src/operators/math/gemm/gemm1x1s1.h | 81 - mobile/src/operators/math/gemm/gemm_kernel.h | 792 --- mobile/src/operators/math/gemm/pack_kernel.h | 801 --- mobile/src/operators/math/gemm/strategy.h | 120 - mobile/src/operators/math/gemm_int8.cpp | 2077 ------ mobile/src/operators/math/gemm_omp_int8.cpp | 453 -- mobile/src/operators/math/gpc.cpp | 2142 ------- mobile/src/operators/math/gpc.h | 222 - mobile/src/operators/math/gru_compute.cpp | 56 - mobile/src/operators/math/gru_compute.h | 40 - mobile/src/operators/math/gru_cpu_kernel.h | 203 - mobile/src/operators/math/im2col.cpp | 668 -- mobile/src/operators/math/im2col.h | 129 - mobile/src/operators/math/math.h | 342 - mobile/src/operators/math/math_function.cpp | 176 - mobile/src/operators/math/math_function.h | 62 - .../src/operators/math/math_function_int8.cpp | 109 - mobile/src/operators/math/pad.cpp | 54 - mobile/src/operators/math/pad.h | 32 - mobile/src/operators/math/poly_util.cpp | 120 - mobile/src/operators/math/poly_util.h | 70 - mobile/src/operators/math/pooling.cpp | 82 - mobile/src/operators/math/pooling.h | 199 - mobile/src/operators/math/pooling2x2.cpp | 791 --- mobile/src/operators/math/pooling3x3.cpp | 1317 ---- mobile/src/operators/math/quantize.h | 108 - .../operators/math/selected_rows_functor.h | 174 - mobile/src/operators/math/sequence2batch.cpp | 60 - mobile/src/operators/math/sequence2batch.h | 169 - .../operators/math/slidingwindow_conv3x3.cpp | 5668 ----------------- .../operators/math/slidingwindow_conv3x3.h | 51 - .../operators/math/slidingwindow_utils.cpp | 365 -- .../src/operators/math/slidingwindow_utils.h | 159 - mobile/src/operators/math/softmax.cpp | 157 - mobile/src/operators/math/softmax.h | 42 - 
mobile/src/operators/math/transform.h | 55 - mobile/src/operators/math/vol2col.cpp | 147 - mobile/src/operators/math/vol2col.h | 94 - .../math/winograd/winograd_transform.h | 42 - .../math/winograd/winograd_transform_f6k3.cpp | 1681 ----- mobile/src/operators/mul_op.cpp | 67 - mobile/src/operators/mul_op.h | 46 - mobile/src/operators/multiclass_nms_op.cpp | 50 - mobile/src/operators/multiclass_nms_op.h | 50 - mobile/src/operators/nearest_interp_op.cpp | 75 - mobile/src/operators/nearest_interp_op.h | 50 - mobile/src/operators/norm_op.cpp | 51 - mobile/src/operators/norm_op.h | 47 - mobile/src/operators/one_hot_op.cpp | 43 - mobile/src/operators/one_hot_op.h | 31 - mobile/src/operators/op_param.cpp | 98 - mobile/src/operators/op_param.h | 3816 ----------- mobile/src/operators/pad2d_op.cpp | 46 - mobile/src/operators/pad2d_op.h | 32 - mobile/src/operators/pixel_shuffle_op.cpp | 43 - mobile/src/operators/pixel_shuffle_op.h | 47 - .../operators/polygon_box_transform_op.cpp | 45 - .../src/operators/polygon_box_transform_op.h | 56 - mobile/src/operators/pool_op.cpp | 73 - mobile/src/operators/pool_op.h | 46 - mobile/src/operators/prelu_op.cpp | 40 - mobile/src/operators/prelu_op.h | 49 - mobile/src/operators/prior_box_op.cpp | 101 - mobile/src/operators/prior_box_op.h | 34 - mobile/src/operators/quantize_op.cpp | 39 - mobile/src/operators/quantize_op.h | 45 - mobile/src/operators/range_op.cpp | 45 - mobile/src/operators/range_op.h | 33 - mobile/src/operators/reduce_prod_op.cpp | 86 - mobile/src/operators/reduce_prod_op.h | 33 - mobile/src/operators/reshape2_op.cpp | 100 - mobile/src/operators/reshape2_op.h | 53 - mobile/src/operators/reshape_op.cpp | 45 - mobile/src/operators/reshape_op.h | 49 - mobile/src/operators/resize_op.cpp | 36 - mobile/src/operators/resize_op.h | 48 - mobile/src/operators/scale_op.cpp | 38 - mobile/src/operators/scale_op.h | 49 - .../sequence_ops/sequence_expand_op.cpp | 56 - .../sequence_ops/sequence_expand_op.h | 47 - 
.../sequence_ops/sequence_pool_op.cpp | 38 - .../operators/sequence_ops/sequence_pool_op.h | 46 - .../sequence_ops/sequence_softmax_op.cpp | 39 - .../sequence_ops/sequence_softmax_op.h | 47 - mobile/src/operators/shape_op.cpp | 38 - mobile/src/operators/shape_op.h | 47 - mobile/src/operators/slice_op.cpp | 109 - mobile/src/operators/slice_op.h | 49 - mobile/src/operators/softmax_op.cpp | 40 - mobile/src/operators/softmax_op.h | 45 - mobile/src/operators/split_op.cpp | 93 - mobile/src/operators/split_op.h | 46 - mobile/src/operators/sum_op.cpp | 67 - mobile/src/operators/sum_op.h | 49 - mobile/src/operators/top_k_op.cpp | 44 - mobile/src/operators/top_k_op.h | 45 - mobile/src/operators/transpose2_op.cpp | 121 - mobile/src/operators/transpose2_op.h | 52 - mobile/src/operators/transpose_op.cpp | 62 - mobile/src/operators/transpose_op.h | 48 - mobile/src/pass/memory_optimize.cpp | 170 - mobile/src/pass/memory_optimize.h | 62 - mobile/src/pass/memory_optimize_cl.cpp | 270 - mobile/src/pass/memory_optimize_cl.h | 75 - mobile/src/pass/model_obfuscate.cpp | 36 - mobile/src/pass/model_obfuscate.h | 36 - mobile/src/pass/pass_base.h | 27 - mobile/src/protobuf-c/protobuf-c.cpp | 2249 ------- mobile/src/protobuf-c/protobuf-c.h | 962 --- mobile/test/CMakeLists.txt | 578 -- mobile/test/common/test_enforce.cpp | 21 - mobile/test/common/test_gemm_accuracy.cpp | 131 - .../test/common/test_gemm_int8_accuracy.cpp | 346 - mobile/test/common/test_gemm_perf.cpp | 164 - mobile/test/common/test_lib_size.cpp | 21 - mobile/test/common/test_lib_size.h | 97 - mobile/test/common/test_log.cpp | 35 - mobile/test/common/test_openmp.cpp | 29 - mobile/test/executor_for_test.h | 141 - mobile/test/executor_for_test_opencl.h | 163 - mobile/test/fpga/test_concat_op.cpp | 87 - mobile/test/fpga/test_densebox_combine.cpp | 49 - mobile/test/fpga/test_format_data.cpp | 93 - mobile/test/fpga/test_marker.cpp | 125 - mobile/test/fpga/test_marker2.cpp | 181 - mobile/test/fpga/test_marker_api.cpp | 241 - 
mobile/test/fpga/test_mobilenet_api.cpp | 158 - mobile/test/fpga/test_pe.cpp | 111 - mobile/test/fpga/test_resnet50.cpp | 140 - mobile/test/fpga/test_rfcn.cpp | 152 - mobile/test/fpga/test_rfcn_api.cpp | 172 - mobile/test/fpga/test_ssd.cpp | 46 - mobile/test/fpga/test_tensor_quant.cpp | 45 - mobile/test/fpga/test_yolo_api.cpp | 158 - mobile/test/framework/test_inference_api.cpp | 62 - mobile/test/framework/test_load.cpp | 34 - mobile/test/framework/test_load_memory.cpp | 68 - .../test_load_memory_inference_api.cpp | 80 - mobile/test/framework/test_optimize.cpp | 33 - mobile/test/net/test_alexnet.cpp | 59 - mobile/test/net/test_benchmark.cpp | 79 - mobile/test/net/test_eng.cpp | 50 - mobile/test/net/test_genet_combine.cpp | 51 - mobile/test/net/test_gesture.cpp | 97 - mobile/test/net/test_googlenet.cpp | 85 - mobile/test/net/test_googlenet_quali.cpp | 55 - mobile/test/net/test_googlenetv1_combine.cpp | 60 - mobile/test/net/test_inceptionv4.cpp | 59 - mobile/test/net/test_inference_ercy.cpp | 129 - mobile/test/net/test_inference_imfix.cpp | 113 - mobile/test/net/test_inference_m2fm.cpp | 130 - mobile/test/net/test_inference_pre_post.cpp | 84 - mobile/test/net/test_mobilenet+ssd.cpp | 48 - mobile/test/net/test_mobilenet.cpp | 60 - mobile/test/net/test_mobilenet_025_fssd.cpp | 61 - mobile/test/net/test_mobilenet_GPU.cpp | 140 - mobile/test/net/test_mobilenet_combine.cpp | 59 - mobile/test/net/test_mobilenet_male2fe.cpp | 66 - .../test/net/test_multi_inference_predict.cpp | 104 - mobile/test/net/test_net.cpp | 277 - mobile/test/net/test_net_benchmark.cpp | 65 - mobile/test/net/test_net_multi_feed.cpp | 221 - mobile/test/net/test_net_performance.cpp | 198 - mobile/test/net/test_nlp.cpp | 94 - mobile/test/net/test_ocr.cpp | 108 - mobile/test/net/test_op_in_net.cpp | 125 - mobile/test/net/test_resnet.cpp | 73 - mobile/test/net/test_squeezenet.cpp | 49 - mobile/test/net/test_super.cpp | 119 - mobile/test/net/test_vgg16ssd.cpp | 46 - mobile/test/net/test_wrap.cpp | 65 - 
mobile/test/net/test_yolo.cpp | 50 - mobile/test/net/test_yolo_combined.cpp | 53 - mobile/test/net/test_yologpu.cpp | 190 - mobile/test/operators/test_batchnorm_op.cpp | 122 - mobile/test/operators/test_box_coder_op.cpp | 196 - mobile/test/operators/test_cast_op.cpp | 126 - mobile/test/operators/test_concat_op.cpp | 136 - .../test/operators/test_conv_add_relu_op.cpp | 45 - .../test/operators/test_conv_bn_relu_op.cpp | 172 - mobile/test/operators/test_conv_gpu.cpp | 199 - mobile/test/operators/test_conv_op.cpp | 358 -- .../test/operators/test_depthwise_conv_op.cpp | 45 - mobile/test/operators/test_dequantize_op.cpp | 76 - .../test/operators/test_dwconv_bn_relu_op.cpp | 145 - .../operators/test_elementwise_add_op.cpp | 62 - .../operators/test_elementwise_sub_op.cpp | 157 - mobile/test/operators/test_expend_op.cpp | 55 - .../test/operators/test_fill_constant_op.cpp | 112 - .../test_fusion_conv_add_bn_relu_op.cpp | 63 - mobile/test/operators/test_fusion_fc_op.cpp | 166 - mobile/test/operators/test_gru_op.cpp | 100 - mobile/test/operators/test_im2sequence_op.cpp | 137 - mobile/test/operators/test_increment_op.cpp | 75 - mobile/test/operators/test_is_empty_op.cpp | 69 - mobile/test/operators/test_leaky_relu_op.cpp | 80 - mobile/test/operators/test_less_than_op.cpp | 122 - mobile/test/operators/test_log_op.cpp | 80 - mobile/test/operators/test_logical_and_op.cpp | 84 - mobile/test/operators/test_logical_not_op.cpp | 76 - mobile/test/operators/test_logical_or_op.cpp | 84 - mobile/test/operators/test_logical_xor_op.cpp | 86 - mobile/test/operators/test_lrn_op.cpp | 83 - mobile/test/operators/test_mul_op.cpp | 102 - .../test/operators/test_multiclass_nms_op.cpp | 162 - .../test_polygon_box_transform_op.cpp | 125 - mobile/test/operators/test_pool_op.cpp | 231 - mobile/test/operators/test_prelu_op.cpp | 58 - mobile/test/operators/test_prior_box_op.cpp | 152 - mobile/test/operators/test_quantize_op.cpp | 153 - mobile/test/operators/test_relu6_op.cpp | 83 - 
mobile/test/operators/test_relu_op.cpp | 82 - mobile/test/operators/test_reshape2_op.cpp | 142 - mobile/test/operators/test_reshape_op.cpp | 47 - mobile/test/operators/test_resize_op.cpp | 47 - mobile/test/operators/test_scale_op.cpp | 18 - .../operators/test_sequence_expand_op.cpp | 97 - .../test/operators/test_sequence_pool_op.cpp | 293 - .../operators/test_sequence_softmax_op.cpp | 100 - mobile/test/operators/test_sigmoid_op.cpp | 80 - mobile/test/operators/test_slice_op.cpp | 18 - mobile/test/operators/test_softmax_op.cpp | 100 - mobile/test/operators/test_sum_op.cpp | 131 - mobile/test/operators/test_tanh_op.cpp | 81 - mobile/test/operators/test_topk_op.cpp | 139 - mobile/test/operators/test_transpose2_op.cpp | 143 - mobile/test/operators/test_transpose_op.cpp | 49 - mobile/test/test_helper.h | 147 - mobile/test/test_include.h | 39 - mobile/third_party/opencl/.gitinore | 1 - .../android-cmake/android.toolchain.cmake | 784 --- .../android-debug-script/push2android.sh | 42 - .../android-debug-script/run_on_android.sh | 37 - mobile/tools/arm-platform.cmake | 9 - mobile/tools/build.sh | 242 - mobile/tools/build_android_armv7.sh | 78 - mobile/tools/build_android_armv8.sh | 78 - mobile/tools/ci_build.sh | 270 - mobile/tools/ci_run_test.sh | 43 - mobile/tools/docker_build_fpga.sh | 7 - mobile/tools/ios-cmake/ios.toolchain.cmake | 216 - mobile/tools/net-detail.awk | 91 - mobile/tools/net.awk | 27 - mobile/tools/op.cmake | 770 --- .../tools/pre-commit.hooks/clang-format.hook | 23 - mobile/tools/pre-commit.hooks/clang-tidy.hook | 18 - mobile/tools/pre-commit.hooks/copyright.hook | 124 - mobile/tools/pre-commit.hooks/cpplint.hook | 13 - mobile/tools/prepare_images_and_models.sh | 20 - mobile/tools/profile_show.sh | 138 - mobile/tools/python/caffetools/run.py | 30 - mobile/tools/python/fluidtools/.gitignore | 6 - mobile/tools/python/fluidtools/run.py | 675 -- .../tools/python/fluidtools/run_multi_feed.py | 695 -- mobile/tools/python/fluidtools/test_wrap.py | 546 -- 
mobile/tools/python/imagetools/README.md | 24 - mobile/tools/python/imagetools/imagetools.py | 71 - mobile/tools/python/imagetools/img2nchw.py | 88 - mobile/tools/python/imagetools/img2nhwc.py | 34 - .../tools/python/imagetools/numpy2binary.py | 60 - mobile/tools/python/misc/.gitignore | 4 - mobile/tools/python/misc/fluidtools.py | 175 - mobile/tools/python/misc/ios-test-server.py | 126 - mobile/tools/python/misc/restore-git.py | 54 - .../python/misc/test-fluid-op-feature.py | 13 - mobile/tools/python/modeltools/.gitignore | 109 - .../tools/python/modeltools/core/__init__.py | 0 .../python/modeltools/core/framework.proto | 176 - .../python/modeltools/core/framework_pb2.py | 1141 ---- .../tools/python/modeltools/core/op_types.py | 93 - .../python/modeltools/mobilenet/__init__.py | 0 .../mobilenet/converter_mobilenet.py | 509 -- .../python/modeltools/mobilenet/swicher.py | 119 - .../tools/python/modeltools/tools/__init__.py | 0 .../modeltools/tools/float2halffloat.py | 70 - .../tools/python/modeltools/tools/loader.py | 11 - .../python/modeltools/tools/model_combine.py | 19 - .../python/modeltools/tools/model_reader.py | 30 - .../tools/python/modeltools/yolo/__init__.py | 0 .../tools/python/modeltools/yolo/mdl2fluid.py | 333 - .../tools/python/modeltools/yolo/swicher.py | 115 - mobile/tools/quantification/CMakeLists.txt | 12 - mobile/tools/quantification/README.md | 37 - mobile/tools/quantification/convert.cpp | 480 -- mobile/tools/quantification/scripts/run.py | 661 -- .../quantification/src/block_desc_local.cpp | 48 - .../quantification/src/block_desc_local.h | 56 - mobile/tools/quantification/src/enforce.h | 67 - .../tools/quantification/src/framework.pb-c.c | 1403 ---- .../tools/quantification/src/framework.pb-c.h | 579 -- .../tools/quantification/src/program_desc.cpp | 30 - .../tools/quantification/src/program_desc.h | 41 - mobile/tools/quantification/src/protobuf-c.c | 2098 ------ mobile/tools/quantification/src/protobuf-c.h | 921 --- 
mobile/tools/quantification/src/tensor_desc.h | 72 - mobile/tools/quantification/src/var_desc.h | 80 - mobile/tools/quantification/tune_n_fold.py | 24 - mobile/tools/shell/change_mobile_namespace.sh | 39 - mobile/tools/shell/check-bitcode.sh | 34 - mobile/tools/shell/check-filename.sh | 41 - .../tools/shell/generate-include/.gitignore | 2 - .../generate-include/check_include_diff.sh | 30 - mobile/tools/shell/generate-include/main.cpp | 6 - mobile/tools/shell/generate-include/parse.py | 21 - mobile/tools/shell/generate-include/run.sh | 9 - mobile/tools/shell/merge.sh | 60 - mobile/tools/shell/prune_static_library.sh | 41 - mobile/tools/shell/restore-private-repo.sh | 5 - .../tools/toolchains/arm-android-neon.cmake | 5 - .../tools/toolchains/arm-linux-gnueabi.cmake | 16 - .../toolchains/arm-linux-gnueabihf.cmake | 10 - 1029 files changed, 150652 deletions(-) delete mode 100644 mobile/.clang-format delete mode 100644 mobile/.clang-tidy delete mode 100644 mobile/.gitignore delete mode 100644 mobile/.pre-commit-config.yaml delete mode 100644 mobile/.travis.yml delete mode 100755 mobile/.travis/pre-commit-job.sh delete mode 100644 mobile/CMakeLists.txt delete mode 100644 mobile/CONTRIBUTING.md delete mode 100644 mobile/Dockerfile delete mode 100644 mobile/LICENSE delete mode 100644 mobile/README.md delete mode 100644 mobile/benchmark/arm_benchmark.md delete mode 100644 mobile/benchmark/metal_benchmark.md delete mode 100644 mobile/demo/ReadMe.md delete mode 100644 mobile/demo/getDemo.sh delete mode 100644 mobile/doc/build.md delete mode 100644 mobile/doc/design_doc.md delete mode 100644 mobile/doc/development_android.md delete mode 100644 mobile/doc/development_android_GPU.md delete mode 100644 mobile/doc/development_arm_linux.md delete mode 100644 mobile/doc/development_fpga.md delete mode 100644 mobile/doc/development_ios.md delete mode 100644 mobile/doc/quantification.md delete mode 100644 mobile/src/common/common.h delete mode 100644 mobile/src/common/enforce.h delete 
mode 100644 mobile/src/common/log.h delete mode 100644 mobile/src/common/threadpool.h delete mode 100644 mobile/src/common/type_define.h delete mode 100755 mobile/src/common/types.cpp delete mode 100644 mobile/src/common/types.h delete mode 100644 mobile/src/common/util.cpp delete mode 100644 mobile/src/common/util.h delete mode 100644 mobile/src/common/variant.h delete mode 100644 mobile/src/fpga/KD/alignment.h delete mode 100644 mobile/src/fpga/KD/context.hpp delete mode 100644 mobile/src/fpga/KD/dl_engine.cpp delete mode 100644 mobile/src/fpga/KD/dl_engine.hpp delete mode 100644 mobile/src/fpga/KD/float16.hpp delete mode 100644 mobile/src/fpga/KD/layout.hpp delete mode 100644 mobile/src/fpga/KD/llapi/bias_scale.cpp delete mode 100644 mobile/src/fpga/KD/llapi/bias_scale.h delete mode 100755 mobile/src/fpga/KD/llapi/config.h delete mode 100644 mobile/src/fpga/KD/llapi/filter.cpp delete mode 100644 mobile/src/fpga/KD/llapi/filter.h delete mode 100644 mobile/src/fpga/KD/llapi/image.cpp delete mode 100644 mobile/src/fpga/KD/llapi/image.h delete mode 100644 mobile/src/fpga/KD/llapi/zynqmp_api.cpp delete mode 100644 mobile/src/fpga/KD/llapi/zynqmp_api.h delete mode 100644 mobile/src/fpga/KD/pe.hpp delete mode 100644 mobile/src/fpga/KD/pe_params.hpp delete mode 100644 mobile/src/fpga/KD/pes/concat_pe.hpp delete mode 100644 mobile/src/fpga/KD/pes/conv_pe.hpp delete mode 100644 mobile/src/fpga/KD/pes/conv_process.hpp delete mode 100644 mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp delete mode 100644 mobile/src/fpga/KD/pes/elementwise_add_pe.hpp delete mode 100644 mobile/src/fpga/KD/pes/fully_connected_pe.hpp delete mode 100644 mobile/src/fpga/KD/pes/input_pe.hpp delete mode 100755 mobile/src/fpga/KD/pes/math_func_neon.h delete mode 100644 mobile/src/fpga/KD/pes/output_pe.hpp delete mode 100644 mobile/src/fpga/KD/pes/pooling_pe.hpp delete mode 100644 mobile/src/fpga/KD/pes/softmax_pe.cpp delete mode 100644 mobile/src/fpga/KD/pes/softmax_pe.hpp delete mode 100644 
mobile/src/fpga/KD/shape.hpp delete mode 100644 mobile/src/fpga/KD/tensor.hpp delete mode 100644 mobile/src/fpga/KD/tensor_util.cpp delete mode 100644 mobile/src/fpga/KD/tensor_util.hpp delete mode 100644 mobile/src/fpga/V1/api.cpp delete mode 100644 mobile/src/fpga/V1/api.h delete mode 100644 mobile/src/fpga/V1/bias_scale.cpp delete mode 100755 mobile/src/fpga/V1/bias_scale.h delete mode 100644 mobile/src/fpga/V1/deconv_bias_scale.cpp delete mode 100644 mobile/src/fpga/V1/deconv_bias_scale.h delete mode 100644 mobile/src/fpga/V1/deconv_filter.cpp delete mode 100644 mobile/src/fpga/V1/deconv_filter.h delete mode 100644 mobile/src/fpga/V1/filter.cpp delete mode 100755 mobile/src/fpga/V1/filter.h delete mode 100644 mobile/src/fpga/V1/image.cpp delete mode 100644 mobile/src/fpga/V1/image.h delete mode 100644 mobile/src/fpga/V1/pe.cpp delete mode 100644 mobile/src/fpga/V2/api.cpp delete mode 100644 mobile/src/fpga/V2/api.h delete mode 100644 mobile/src/fpga/V2/bias_scale.cpp delete mode 100644 mobile/src/fpga/V2/bias_scale.h delete mode 100644 mobile/src/fpga/V2/deconv_bias_scale.cpp delete mode 100644 mobile/src/fpga/V2/deconv_bias_scale.h delete mode 100644 mobile/src/fpga/V2/deconv_filter.cpp delete mode 100644 mobile/src/fpga/V2/deconv_filter.h delete mode 100644 mobile/src/fpga/V2/filter.cpp delete mode 100644 mobile/src/fpga/V2/filter.h delete mode 100644 mobile/src/fpga/V2/image.cpp delete mode 100644 mobile/src/fpga/V2/image.h delete mode 100644 mobile/src/fpga/V2/pe.cpp delete mode 100644 mobile/src/fpga/common/config.h delete mode 100755 mobile/src/fpga/common/driver.cpp delete mode 100644 mobile/src/fpga/common/driver.h delete mode 100644 mobile/src/fpga/common/fpga_common.cpp delete mode 100755 mobile/src/fpga/common/fpga_common.h delete mode 100644 mobile/src/fpga/common/pe.h delete mode 100644 mobile/src/framework/CMakeLists.txt delete mode 100644 mobile/src/framework/attribute.cpp delete mode 100644 mobile/src/framework/attribute.h delete mode 100644 
mobile/src/framework/cl/cl_deleter.h delete mode 100644 mobile/src/framework/cl/cl_engine.cpp delete mode 100644 mobile/src/framework/cl/cl_engine.h delete mode 100644 mobile/src/framework/cl/cl_half.cpp delete mode 100644 mobile/src/framework/cl/cl_half.h delete mode 100644 mobile/src/framework/cl/cl_helper.h delete mode 100644 mobile/src/framework/cl/cl_image.cpp delete mode 100644 mobile/src/framework/cl/cl_image.h delete mode 100644 mobile/src/framework/cl/cl_image_converter.cpp delete mode 100644 mobile/src/framework/cl/cl_image_converter.h delete mode 100644 mobile/src/framework/cl/cl_scope.h delete mode 100644 mobile/src/framework/cl/cl_tensor.h delete mode 100644 mobile/src/framework/cl/cl_tool.cpp delete mode 100644 mobile/src/framework/cl/cl_tool.h delete mode 100644 mobile/src/framework/context.cpp delete mode 100644 mobile/src/framework/context.h delete mode 100644 mobile/src/framework/data_layout.h delete mode 100644 mobile/src/framework/data_type.cpp delete mode 100644 mobile/src/framework/data_type.h delete mode 100644 mobile/src/framework/ddim.cpp delete mode 100644 mobile/src/framework/ddim.h delete mode 100644 mobile/src/framework/dim.h delete mode 100644 mobile/src/framework/executor.cpp delete mode 100644 mobile/src/framework/executor.h delete mode 100644 mobile/src/framework/framework.pb-c.cpp delete mode 100644 mobile/src/framework/framework.pb-c.h delete mode 100644 mobile/src/framework/framework.proto delete mode 100755 mobile/src/framework/load_ops.h delete mode 100644 mobile/src/framework/loader.cpp delete mode 100644 mobile/src/framework/loader.h delete mode 100644 mobile/src/framework/lod_tensor.cpp delete mode 100644 mobile/src/framework/lod_tensor.h delete mode 100644 mobile/src/framework/mixed_vector.h delete mode 100644 mobile/src/framework/op_info.h delete mode 100644 mobile/src/framework/op_kernel_type.h delete mode 100644 mobile/src/framework/op_proto_maker.h delete mode 100644 mobile/src/framework/op_registry.h delete mode 100644 
mobile/src/framework/operator.cpp delete mode 100644 mobile/src/framework/operator.h delete mode 100644 mobile/src/framework/program/block_desc.cpp delete mode 100644 mobile/src/framework/program/block_desc.h delete mode 100644 mobile/src/framework/program/op_desc.cpp delete mode 100644 mobile/src/framework/program/op_desc.h delete mode 100644 mobile/src/framework/program/program-optimize/fusion_op_register.h delete mode 100644 mobile/src/framework/program/program-optimize/node.cpp delete mode 100644 mobile/src/framework/program/program-optimize/node.h delete mode 100644 mobile/src/framework/program/program-optimize/program_optimize.cpp delete mode 100644 mobile/src/framework/program/program-optimize/program_optimize.h delete mode 100644 mobile/src/framework/program/program.h delete mode 100644 mobile/src/framework/program/program_desc.cpp delete mode 100644 mobile/src/framework/program/program_desc.h delete mode 100644 mobile/src/framework/program/tensor_desc.h delete mode 100644 mobile/src/framework/program/var_desc.h delete mode 100644 mobile/src/framework/scope.cpp delete mode 100644 mobile/src/framework/scope.h delete mode 100644 mobile/src/framework/selected_rows.cpp delete mode 100644 mobile/src/framework/selected_rows.h delete mode 100644 mobile/src/framework/tensor.h delete mode 100644 mobile/src/framework/tensor_base.h delete mode 100644 mobile/src/framework/tensor_util.cpp delete mode 100644 mobile/src/framework/tensor_util.h delete mode 100644 mobile/src/framework/type_trait.h delete mode 100644 mobile/src/framework/variable.h delete mode 100644 mobile/src/framework/zynqmp/ztensor.hpp delete mode 100644 mobile/src/io/api.cc delete mode 100644 mobile/src/io/api_paddle_mobile.cc delete mode 100644 mobile/src/io/api_paddle_mobile.h delete mode 100644 mobile/src/io/ios_io/PaddleMobileCPU.h delete mode 100644 mobile/src/io/ios_io/PaddleMobileCPU.mm delete mode 100644 mobile/src/io/jni/PML.java delete mode 100644 mobile/src/io/jni/paddle_mobile_jni.cpp delete 
mode 100644 mobile/src/io/jni/paddle_mobile_jni.h delete mode 100644 mobile/src/io/loader.h delete mode 100644 mobile/src/io/opencl_interface.cpp delete mode 100644 mobile/src/io/opencl_interface.h delete mode 100644 mobile/src/io/paddle_inference_api.h delete mode 100644 mobile/src/io/paddle_mobile.cpp delete mode 100644 mobile/src/io/paddle_mobile.h delete mode 100644 mobile/src/io/paddle_mobile_wrap.cpp delete mode 100644 mobile/src/io/paddle_mobile_wrap.h delete mode 100644 mobile/src/io/paddle_test_inference_api.cpp delete mode 100644 mobile/src/io/paddle_test_inference_api.h delete mode 100755 mobile/src/memory/t_malloc.cpp delete mode 100644 mobile/src/memory/t_malloc.h delete mode 100755 mobile/src/operators/activation_op.cpp delete mode 100644 mobile/src/operators/activation_op.h delete mode 100644 mobile/src/operators/assign_op.cpp delete mode 100644 mobile/src/operators/assign_op.h delete mode 100644 mobile/src/operators/assign_value_op.cpp delete mode 100644 mobile/src/operators/assign_value_op.h delete mode 100644 mobile/src/operators/batchnorm_op.cpp delete mode 100644 mobile/src/operators/batchnorm_op.h delete mode 100644 mobile/src/operators/beam_search_decode_op.cpp delete mode 100644 mobile/src/operators/beam_search_decode_op.h delete mode 100644 mobile/src/operators/beam_search_op.cpp delete mode 100644 mobile/src/operators/beam_search_op.h delete mode 100644 mobile/src/operators/bilinear_interp_op.cpp delete mode 100644 mobile/src/operators/bilinear_interp_op.h delete mode 100644 mobile/src/operators/box_coder_op.cpp delete mode 100644 mobile/src/operators/box_coder_op.h delete mode 100644 mobile/src/operators/cast_op.cpp delete mode 100644 mobile/src/operators/cast_op.h delete mode 100644 mobile/src/operators/compare_op.cpp delete mode 100644 mobile/src/operators/compare_op.h delete mode 100644 mobile/src/operators/concat_op.cpp delete mode 100644 mobile/src/operators/concat_op.h delete mode 100644 mobile/src/operators/conditional_block_op.cpp 
delete mode 100644 mobile/src/operators/conditional_block_op.h delete mode 100644 mobile/src/operators/controlflow/tensor_array_read_write_op.cpp delete mode 100644 mobile/src/operators/controlflow/tensor_array_read_write_op.h delete mode 100644 mobile/src/operators/controlflow/while_op.cpp delete mode 100644 mobile/src/operators/controlflow/while_op.h delete mode 100644 mobile/src/operators/conv_op.cpp delete mode 100644 mobile/src/operators/conv_op.h delete mode 100755 mobile/src/operators/conv_transpose_op.cpp delete mode 100755 mobile/src/operators/conv_transpose_op.h delete mode 100644 mobile/src/operators/crf_op.cpp delete mode 100644 mobile/src/operators/crf_op.h delete mode 100644 mobile/src/operators/depthwise_conv_op.cpp delete mode 100644 mobile/src/operators/depthwise_conv_op.h delete mode 100644 mobile/src/operators/dequantize_op.cpp delete mode 100644 mobile/src/operators/dequantize_op.h delete mode 100644 mobile/src/operators/detection_ops.cpp delete mode 100644 mobile/src/operators/detection_ops.h delete mode 100644 mobile/src/operators/dropout_op.cpp delete mode 100644 mobile/src/operators/dropout_op.h delete mode 100644 mobile/src/operators/elementwise_add_op.cpp delete mode 100644 mobile/src/operators/elementwise_add_op.h delete mode 100644 mobile/src/operators/elementwise_mul_op.cpp delete mode 100644 mobile/src/operators/elementwise_mul_op.h delete mode 100644 mobile/src/operators/elementwise_sub_op.cpp delete mode 100644 mobile/src/operators/elementwise_sub_op.h delete mode 100644 mobile/src/operators/exp_op.cpp delete mode 100644 mobile/src/operators/exp_op.h delete mode 100644 mobile/src/operators/expand_op.cpp delete mode 100644 mobile/src/operators/expand_op.h delete mode 100644 mobile/src/operators/feed_op.cpp delete mode 100644 mobile/src/operators/feed_op.h delete mode 100644 mobile/src/operators/fetch_op.cpp delete mode 100644 mobile/src/operators/fetch_op.h delete mode 100644 mobile/src/operators/fill_constant_batch_size_like_op.cpp 
delete mode 100644 mobile/src/operators/fill_constant_batch_size_like_op.h delete mode 100644 mobile/src/operators/fill_constant_op.cpp delete mode 100644 mobile/src/operators/fill_constant_op.h delete mode 100644 mobile/src/operators/flatten2_op.cpp delete mode 100644 mobile/src/operators/flatten2_op.h delete mode 100644 mobile/src/operators/flatten_op.cpp delete mode 100644 mobile/src/operators/flatten_op.h delete mode 100644 mobile/src/operators/fusion_conv_add_bn_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_add_bn_op.h delete mode 100644 mobile/src/operators/fusion_conv_add_bn_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_add_bn_relu_op.h delete mode 100644 mobile/src/operators/fusion_conv_add_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_add_op.h delete mode 100644 mobile/src/operators/fusion_conv_add_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_add_relu_op.h delete mode 100644 mobile/src/operators/fusion_conv_bn_add_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_bn_add_relu_op.h delete mode 100644 mobile/src/operators/fusion_conv_bn_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_bn_op.h delete mode 100644 mobile/src/operators/fusion_conv_bn_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_bn_relu_op.h delete mode 100644 mobile/src/operators/fusion_conv_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_conv_relu_op.h delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_op.cpp delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_op.h delete mode 100755 mobile/src/operators/fusion_deconv_add_bn_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_relu_op.h delete mode 100644 mobile/src/operators/fusion_deconv_add_op.cpp delete mode 100644 mobile/src/operators/fusion_deconv_add_op.h delete mode 100644 mobile/src/operators/fusion_deconv_add_relu_op.cpp delete mode 100644 
mobile/src/operators/fusion_deconv_add_relu_op.h delete mode 100644 mobile/src/operators/fusion_deconv_bn_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_deconv_bn_relu_op.h delete mode 100644 mobile/src/operators/fusion_deconv_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_deconv_relu_op.h delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_op.cpp delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_op.h delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_op.h delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.cpp delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.h delete mode 100644 mobile/src/operators/fusion_dequant_bn_op.cpp delete mode 100644 mobile/src/operators/fusion_dequant_bn_op.h delete mode 100644 mobile/src/operators/fusion_dequant_bn_relu_op.h delete mode 100644 mobile/src/operators/fusion_dwconv_bn_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_dwconv_bn_relu_op.h delete mode 100644 mobile/src/operators/fusion_elementwise_add_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_elementwise_add_relu_op.h delete mode 100644 mobile/src/operators/fusion_fc_op.cpp delete mode 100644 mobile/src/operators/fusion_fc_op.h delete mode 100644 mobile/src/operators/fusion_fc_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_fc_relu_op.h delete mode 100644 mobile/src/operators/fusion_instancenorm_relu_op.cpp delete mode 100644 mobile/src/operators/fusion_instancenorm_relu_op.h delete mode 100644 mobile/src/operators/grid_sampler_op.cpp delete mode 100644 mobile/src/operators/grid_sampler_op.h delete mode 100644 mobile/src/operators/gru_op.cpp delete mode 100644 mobile/src/operators/gru_op.h delete mode 100644 mobile/src/operators/gru_unit_op.cpp delete mode 100644 mobile/src/operators/gru_unit_op.h delete mode 100644 mobile/src/operators/im2sequence_op.cpp 
delete mode 100644 mobile/src/operators/im2sequence_op.h delete mode 100644 mobile/src/operators/increment_op.cpp delete mode 100644 mobile/src/operators/increment_op.h delete mode 100644 mobile/src/operators/instancenorm_op.cpp delete mode 100644 mobile/src/operators/instancenorm_op.h delete mode 100644 mobile/src/operators/is_empty_op.cpp delete mode 100644 mobile/src/operators/is_empty_op.h delete mode 100644 mobile/src/operators/kernel/activation_kernel.h delete mode 100644 mobile/src/operators/kernel/arm/activation_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/anchor_generator_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/assign_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/assign_value_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/batchnorm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/beam_search_decode_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/beam_search_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/bilinear_interp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/box_coder_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/cast_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/compare_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/conditional_block_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_common.cpp delete mode 100644 
mobile/src/operators/kernel/arm/convolution/conv_common.h delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/crf_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/density_prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/dequantize_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/dequantize_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/elementwise_mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/elementwise_sub_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/exp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/flatten_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/gru_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/gru_unit_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/im2sequence_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/increment_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/is_empty_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/lod_reset_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/logical_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/lookup_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/lrn_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/arm/mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/multiclass_nms_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/nearest_interp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/norm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/one_hot_kernel.cpp delete mode 100755 mobile/src/operators/kernel/arm/pad2d_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/polygon_box_transform_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/prelu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/proposal_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/psroi_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/quantize_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/resize_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/roi_perspective_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/scale_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/sequence_expand_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/sequence_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/sequence_softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/shape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/slice_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/sum_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/top_k_kernel.cpp 
delete mode 100644 mobile/src/operators/kernel/arm/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/arm/while_kernel.cpp delete mode 100644 mobile/src/operators/kernel/assign_kernel.h delete mode 100644 mobile/src/operators/kernel/assign_value_kernel.h delete mode 100644 mobile/src/operators/kernel/batchnorm_kernel.h delete mode 100644 mobile/src/operators/kernel/beam_search_decode_kernel.h delete mode 100644 mobile/src/operators/kernel/beam_search_kernel.h delete mode 100644 mobile/src/operators/kernel/bilinear_interp_kernel.h delete mode 100644 mobile/src/operators/kernel/box_coder_kernel.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/activation_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/batchnorm_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/box_coder_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/concat_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_arm_func.cpp delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/crf_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/density_prior_box_arm_func.h delete 
mode 100644 mobile/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/flatten_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/gru_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/gru_unit_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/increment_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/lookup_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/lrn_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/mul_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/norm_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/pool_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/prior_box_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/reshape2_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/reshape_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/shape_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/softmax_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/split_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/sum_arm_func.h delete mode 100644 
mobile/src/operators/kernel/central-arm-func/transpose_arm_func.h delete mode 100644 mobile/src/operators/kernel/cl/batchnorm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/bilinear_interp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/box_coder_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.cpp delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.h delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/cl_common.h delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/concat_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/density_prior_box_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/dropout_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl delete mode 100644 
mobile/src/operators/kernel/cl/cl_kernel/exp_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/expend.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/feed_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/flatten2_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/instancenorm_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pad2d_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pixel_shuffle_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pool_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pre_post_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/relu.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/relu6.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/reshape.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/scale_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/sigmoid.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/slice_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/softmax.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/tanh_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/cl/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/density_prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/depthwise_conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/exp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/expand_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/flatten2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/gen_code.py delete mode 100644 mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/instancenorm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/leakyrelu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/lrn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/multiclass_nms_kernel.cpp 
delete mode 100644 mobile/src/operators/kernel/cl/nearest_interp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/pad2d_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/pixel_shuffle_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/relu6_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/scale_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/slice_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/compare_kernel.h delete mode 100644 mobile/src/operators/kernel/concat_kernel.h delete mode 100644 mobile/src/operators/kernel/conditional_block_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_relu_kernel.h delete mode 
100644 mobile/src/operators/kernel/conv_transpose_kernel.h delete mode 100644 mobile/src/operators/kernel/crf_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_add_bn_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_add_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_add_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_add_relu_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/dequant_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/dequantize_kernel.h delete mode 100644 mobile/src/operators/kernel/detection_kernel.h delete mode 100644 mobile/src/operators/kernel/dropout_kernel.h delete mode 100644 mobile/src/operators/kernel/dwconv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_add_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_mul_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_sub_kernel.h delete mode 100644 mobile/src/operators/kernel/exp_kernel.h delete mode 100644 mobile/src/operators/kernel/expand_kernel.h delete mode 100644 mobile/src/operators/kernel/fc_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/feed_kernel.h delete mode 100644 mobile/src/operators/kernel/fetch_kernel.h delete mode 100644 mobile/src/operators/kernel/flatten2_kernel.h delete mode 100644 mobile/src/operators/kernel/flatten_kernel.h delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_bn_relu_kernel.cpp 
delete mode 100644 mobile/src/operators/kernel/fpga/KD/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/fpga/V1/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/pad2d_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/proposal_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/slice_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/transpose2_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/fpga/V2/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/split_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/fpga/V2/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fusion_fc_kernel.h delete mode 100644 mobile/src/operators/kernel/grid_sampler_kernel.h delete mode 100644 mobile/src/operators/kernel/gru_kernel.h delete mode 100644 mobile/src/operators/kernel/gru_unit_kernel.h delete mode 100644 mobile/src/operators/kernel/im2sequence_kernel.h delete mode 100644 mobile/src/operators/kernel/increment_kernel.h delete mode 100644 mobile/src/operators/kernel/instancenorm_kernel.h delete mode 100644 mobile/src/operators/kernel/instancenorm_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/is_empty_kernel.h delete mode 100644 mobile/src/operators/kernel/kernels.h delete mode 100644 mobile/src/operators/kernel/logical_kernel.h delete mode 100644 mobile/src/operators/kernel/lookup_kernel.h delete mode 100644 mobile/src/operators/kernel/lrn_kernel.h delete mode 100644 mobile/src/operators/kernel/mul_kernel.h delete mode 100644 mobile/src/operators/kernel/multiclass_nms_kernel.h delete mode 100644 mobile/src/operators/kernel/nearest_interp_kernel.h delete mode 100644 mobile/src/operators/kernel/norm_kernel.h delete mode 100644 mobile/src/operators/kernel/one_hot_kernel.h delete mode 100644 mobile/src/operators/kernel/pad2d_kernel.h delete mode 100644 mobile/src/operators/kernel/pixel_shuffle_kernel.h delete mode 100644 mobile/src/operators/kernel/polygon_box_transform_kernel.h delete mode 100644 mobile/src/operators/kernel/pool_kernel.h delete mode 100644 mobile/src/operators/kernel/prelu_kernel.h delete mode 100644 mobile/src/operators/kernel/prior_box_kernel.h delete mode 100644 mobile/src/operators/kernel/quantize_kernel.h delete mode 100644 mobile/src/operators/kernel/range_kernel.cpp delete mode 100644 mobile/src/operators/kernel/range_kernel.h delete mode 100644 mobile/src/operators/kernel/reduce_prod_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/reduce_prod_kernel.h delete mode 100644 mobile/src/operators/kernel/reshape2_kernel.h delete mode 100644 mobile/src/operators/kernel/reshape_kernel.h delete mode 100644 mobile/src/operators/kernel/resize_kernel.h delete mode 100644 mobile/src/operators/kernel/scale_kernel.h delete mode 100644 mobile/src/operators/kernel/sequence_kernels.h delete mode 100644 mobile/src/operators/kernel/shape_kernel.h delete mode 100644 mobile/src/operators/kernel/slice_kernel.h delete mode 100644 mobile/src/operators/kernel/softmax_kernel.h delete mode 100644 mobile/src/operators/kernel/split_kernel.h delete mode 100644 mobile/src/operators/kernel/sum_kernel.h delete mode 100644 mobile/src/operators/kernel/tanh_kernel.h delete mode 100644 mobile/src/operators/kernel/tensor_array_read_write_kernel.h delete mode 100644 mobile/src/operators/kernel/transpose2_kernel.h delete mode 100644 mobile/src/operators/kernel/transpose_kernel.h delete mode 100644 mobile/src/operators/kernel/while_kernel.h delete mode 100644 mobile/src/operators/lod_reset_op.cpp delete mode 100644 mobile/src/operators/lod_reset_op.h delete mode 100644 mobile/src/operators/logical_op.cpp delete mode 100644 mobile/src/operators/logical_op.h delete mode 100644 mobile/src/operators/lookup_op.cpp delete mode 100644 mobile/src/operators/lookup_op.h delete mode 100644 mobile/src/operators/lrn_op.cpp delete mode 100644 mobile/src/operators/lrn_op.h delete mode 100644 mobile/src/operators/math/activation.h delete mode 100644 mobile/src/operators/math/depthwise/faster_depthwise_conv3x3.h delete mode 100644 mobile/src/operators/math/depthwise/faster_depthwise_conv3x3p1.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv3x3.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv3x3.h delete mode 100644 mobile/src/operators/math/depthwise_conv3x3_int8.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv5x5.cpp delete mode 100644 
mobile/src/operators/math/depthwise_conv5x5.h delete mode 100644 mobile/src/operators/math/depthwise_conv5x5_int8.cpp delete mode 100644 mobile/src/operators/math/element_wise.h delete mode 100644 mobile/src/operators/math/elementwise_op_function.h delete mode 100644 mobile/src/operators/math/gemm.cpp delete mode 100644 mobile/src/operators/math/gemm.h delete mode 100644 mobile/src/operators/math/gemm/cblas.cc delete mode 100644 mobile/src/operators/math/gemm/cblas.h delete mode 100644 mobile/src/operators/math/gemm/executor.h delete mode 100644 mobile/src/operators/math/gemm/gemm1x1s1.cpp delete mode 100644 mobile/src/operators/math/gemm/gemm1x1s1.h delete mode 100644 mobile/src/operators/math/gemm/gemm_kernel.h delete mode 100644 mobile/src/operators/math/gemm/pack_kernel.h delete mode 100644 mobile/src/operators/math/gemm/strategy.h delete mode 100644 mobile/src/operators/math/gemm_int8.cpp delete mode 100644 mobile/src/operators/math/gemm_omp_int8.cpp delete mode 100644 mobile/src/operators/math/gpc.cpp delete mode 100644 mobile/src/operators/math/gpc.h delete mode 100644 mobile/src/operators/math/gru_compute.cpp delete mode 100644 mobile/src/operators/math/gru_compute.h delete mode 100644 mobile/src/operators/math/gru_cpu_kernel.h delete mode 100644 mobile/src/operators/math/im2col.cpp delete mode 100644 mobile/src/operators/math/im2col.h delete mode 100644 mobile/src/operators/math/math.h delete mode 100644 mobile/src/operators/math/math_function.cpp delete mode 100644 mobile/src/operators/math/math_function.h delete mode 100644 mobile/src/operators/math/math_function_int8.cpp delete mode 100644 mobile/src/operators/math/pad.cpp delete mode 100644 mobile/src/operators/math/pad.h delete mode 100644 mobile/src/operators/math/poly_util.cpp delete mode 100644 mobile/src/operators/math/poly_util.h delete mode 100644 mobile/src/operators/math/pooling.cpp delete mode 100644 mobile/src/operators/math/pooling.h delete mode 100644 
mobile/src/operators/math/pooling2x2.cpp delete mode 100644 mobile/src/operators/math/pooling3x3.cpp delete mode 100644 mobile/src/operators/math/quantize.h delete mode 100644 mobile/src/operators/math/selected_rows_functor.h delete mode 100644 mobile/src/operators/math/sequence2batch.cpp delete mode 100644 mobile/src/operators/math/sequence2batch.h delete mode 100644 mobile/src/operators/math/slidingwindow_conv3x3.cpp delete mode 100644 mobile/src/operators/math/slidingwindow_conv3x3.h delete mode 100644 mobile/src/operators/math/slidingwindow_utils.cpp delete mode 100644 mobile/src/operators/math/slidingwindow_utils.h delete mode 100644 mobile/src/operators/math/softmax.cpp delete mode 100644 mobile/src/operators/math/softmax.h delete mode 100644 mobile/src/operators/math/transform.h delete mode 100644 mobile/src/operators/math/vol2col.cpp delete mode 100644 mobile/src/operators/math/vol2col.h delete mode 100644 mobile/src/operators/math/winograd/winograd_transform.h delete mode 100644 mobile/src/operators/math/winograd/winograd_transform_f6k3.cpp delete mode 100644 mobile/src/operators/mul_op.cpp delete mode 100644 mobile/src/operators/mul_op.h delete mode 100644 mobile/src/operators/multiclass_nms_op.cpp delete mode 100644 mobile/src/operators/multiclass_nms_op.h delete mode 100644 mobile/src/operators/nearest_interp_op.cpp delete mode 100644 mobile/src/operators/nearest_interp_op.h delete mode 100644 mobile/src/operators/norm_op.cpp delete mode 100644 mobile/src/operators/norm_op.h delete mode 100644 mobile/src/operators/one_hot_op.cpp delete mode 100644 mobile/src/operators/one_hot_op.h delete mode 100644 mobile/src/operators/op_param.cpp delete mode 100644 mobile/src/operators/op_param.h delete mode 100755 mobile/src/operators/pad2d_op.cpp delete mode 100644 mobile/src/operators/pad2d_op.h delete mode 100644 mobile/src/operators/pixel_shuffle_op.cpp delete mode 100644 mobile/src/operators/pixel_shuffle_op.h delete mode 100644 
mobile/src/operators/polygon_box_transform_op.cpp delete mode 100644 mobile/src/operators/polygon_box_transform_op.h delete mode 100644 mobile/src/operators/pool_op.cpp delete mode 100644 mobile/src/operators/pool_op.h delete mode 100644 mobile/src/operators/prelu_op.cpp delete mode 100644 mobile/src/operators/prelu_op.h delete mode 100644 mobile/src/operators/prior_box_op.cpp delete mode 100644 mobile/src/operators/prior_box_op.h delete mode 100644 mobile/src/operators/quantize_op.cpp delete mode 100644 mobile/src/operators/quantize_op.h delete mode 100644 mobile/src/operators/range_op.cpp delete mode 100644 mobile/src/operators/range_op.h delete mode 100644 mobile/src/operators/reduce_prod_op.cpp delete mode 100644 mobile/src/operators/reduce_prod_op.h delete mode 100644 mobile/src/operators/reshape2_op.cpp delete mode 100644 mobile/src/operators/reshape2_op.h delete mode 100644 mobile/src/operators/reshape_op.cpp delete mode 100644 mobile/src/operators/reshape_op.h delete mode 100644 mobile/src/operators/resize_op.cpp delete mode 100644 mobile/src/operators/resize_op.h delete mode 100644 mobile/src/operators/scale_op.cpp delete mode 100644 mobile/src/operators/scale_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_expand_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_expand_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_pool_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_pool_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_softmax_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_softmax_op.h delete mode 100644 mobile/src/operators/shape_op.cpp delete mode 100644 mobile/src/operators/shape_op.h delete mode 100644 mobile/src/operators/slice_op.cpp delete mode 100644 mobile/src/operators/slice_op.h delete mode 100644 mobile/src/operators/softmax_op.cpp delete mode 100644 mobile/src/operators/softmax_op.h delete mode 100644 
mobile/src/operators/split_op.cpp delete mode 100644 mobile/src/operators/split_op.h delete mode 100644 mobile/src/operators/sum_op.cpp delete mode 100644 mobile/src/operators/sum_op.h delete mode 100644 mobile/src/operators/top_k_op.cpp delete mode 100644 mobile/src/operators/top_k_op.h delete mode 100644 mobile/src/operators/transpose2_op.cpp delete mode 100644 mobile/src/operators/transpose2_op.h delete mode 100644 mobile/src/operators/transpose_op.cpp delete mode 100644 mobile/src/operators/transpose_op.h delete mode 100644 mobile/src/pass/memory_optimize.cpp delete mode 100644 mobile/src/pass/memory_optimize.h delete mode 100644 mobile/src/pass/memory_optimize_cl.cpp delete mode 100644 mobile/src/pass/memory_optimize_cl.h delete mode 100644 mobile/src/pass/model_obfuscate.cpp delete mode 100644 mobile/src/pass/model_obfuscate.h delete mode 100644 mobile/src/pass/pass_base.h delete mode 100644 mobile/src/protobuf-c/protobuf-c.cpp delete mode 100644 mobile/src/protobuf-c/protobuf-c.h delete mode 100644 mobile/test/CMakeLists.txt delete mode 100644 mobile/test/common/test_enforce.cpp delete mode 100644 mobile/test/common/test_gemm_accuracy.cpp delete mode 100644 mobile/test/common/test_gemm_int8_accuracy.cpp delete mode 100644 mobile/test/common/test_gemm_perf.cpp delete mode 100644 mobile/test/common/test_lib_size.cpp delete mode 100644 mobile/test/common/test_lib_size.h delete mode 100644 mobile/test/common/test_log.cpp delete mode 100644 mobile/test/common/test_openmp.cpp delete mode 100644 mobile/test/executor_for_test.h delete mode 100644 mobile/test/executor_for_test_opencl.h delete mode 100644 mobile/test/fpga/test_concat_op.cpp delete mode 100644 mobile/test/fpga/test_densebox_combine.cpp delete mode 100644 mobile/test/fpga/test_format_data.cpp delete mode 100644 mobile/test/fpga/test_marker.cpp delete mode 100644 mobile/test/fpga/test_marker2.cpp delete mode 100644 mobile/test/fpga/test_marker_api.cpp delete mode 100644 
mobile/test/fpga/test_mobilenet_api.cpp delete mode 100644 mobile/test/fpga/test_pe.cpp delete mode 100644 mobile/test/fpga/test_resnet50.cpp delete mode 100644 mobile/test/fpga/test_rfcn.cpp delete mode 100644 mobile/test/fpga/test_rfcn_api.cpp delete mode 100644 mobile/test/fpga/test_ssd.cpp delete mode 100644 mobile/test/fpga/test_tensor_quant.cpp delete mode 100644 mobile/test/fpga/test_yolo_api.cpp delete mode 100644 mobile/test/framework/test_inference_api.cpp delete mode 100644 mobile/test/framework/test_load.cpp delete mode 100644 mobile/test/framework/test_load_memory.cpp delete mode 100644 mobile/test/framework/test_load_memory_inference_api.cpp delete mode 100644 mobile/test/framework/test_optimize.cpp delete mode 100644 mobile/test/net/test_alexnet.cpp delete mode 100644 mobile/test/net/test_benchmark.cpp delete mode 100644 mobile/test/net/test_eng.cpp delete mode 100644 mobile/test/net/test_genet_combine.cpp delete mode 100644 mobile/test/net/test_gesture.cpp delete mode 100644 mobile/test/net/test_googlenet.cpp delete mode 100644 mobile/test/net/test_googlenet_quali.cpp delete mode 100644 mobile/test/net/test_googlenetv1_combine.cpp delete mode 100644 mobile/test/net/test_inceptionv4.cpp delete mode 100644 mobile/test/net/test_inference_ercy.cpp delete mode 100644 mobile/test/net/test_inference_imfix.cpp delete mode 100644 mobile/test/net/test_inference_m2fm.cpp delete mode 100644 mobile/test/net/test_inference_pre_post.cpp delete mode 100644 mobile/test/net/test_mobilenet+ssd.cpp delete mode 100644 mobile/test/net/test_mobilenet.cpp delete mode 100644 mobile/test/net/test_mobilenet_025_fssd.cpp delete mode 100644 mobile/test/net/test_mobilenet_GPU.cpp delete mode 100644 mobile/test/net/test_mobilenet_combine.cpp delete mode 100644 mobile/test/net/test_mobilenet_male2fe.cpp delete mode 100644 mobile/test/net/test_multi_inference_predict.cpp delete mode 100644 mobile/test/net/test_net.cpp delete mode 100644 mobile/test/net/test_net_benchmark.cpp delete 
mode 100644 mobile/test/net/test_net_multi_feed.cpp delete mode 100644 mobile/test/net/test_net_performance.cpp delete mode 100644 mobile/test/net/test_nlp.cpp delete mode 100644 mobile/test/net/test_ocr.cpp delete mode 100644 mobile/test/net/test_op_in_net.cpp delete mode 100644 mobile/test/net/test_resnet.cpp delete mode 100644 mobile/test/net/test_squeezenet.cpp delete mode 100644 mobile/test/net/test_super.cpp delete mode 100644 mobile/test/net/test_vgg16ssd.cpp delete mode 100644 mobile/test/net/test_wrap.cpp delete mode 100644 mobile/test/net/test_yolo.cpp delete mode 100644 mobile/test/net/test_yolo_combined.cpp delete mode 100644 mobile/test/net/test_yologpu.cpp delete mode 100644 mobile/test/operators/test_batchnorm_op.cpp delete mode 100644 mobile/test/operators/test_box_coder_op.cpp delete mode 100644 mobile/test/operators/test_cast_op.cpp delete mode 100644 mobile/test/operators/test_concat_op.cpp delete mode 100644 mobile/test/operators/test_conv_add_relu_op.cpp delete mode 100644 mobile/test/operators/test_conv_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_conv_gpu.cpp delete mode 100644 mobile/test/operators/test_conv_op.cpp delete mode 100644 mobile/test/operators/test_depthwise_conv_op.cpp delete mode 100644 mobile/test/operators/test_dequantize_op.cpp delete mode 100644 mobile/test/operators/test_dwconv_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_elementwise_add_op.cpp delete mode 100644 mobile/test/operators/test_elementwise_sub_op.cpp delete mode 100644 mobile/test/operators/test_expend_op.cpp delete mode 100644 mobile/test/operators/test_fill_constant_op.cpp delete mode 100644 mobile/test/operators/test_fusion_conv_add_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_fusion_fc_op.cpp delete mode 100644 mobile/test/operators/test_gru_op.cpp delete mode 100644 mobile/test/operators/test_im2sequence_op.cpp delete mode 100644 mobile/test/operators/test_increment_op.cpp delete mode 100644 
mobile/test/operators/test_is_empty_op.cpp delete mode 100644 mobile/test/operators/test_leaky_relu_op.cpp delete mode 100644 mobile/test/operators/test_less_than_op.cpp delete mode 100644 mobile/test/operators/test_log_op.cpp delete mode 100644 mobile/test/operators/test_logical_and_op.cpp delete mode 100644 mobile/test/operators/test_logical_not_op.cpp delete mode 100644 mobile/test/operators/test_logical_or_op.cpp delete mode 100644 mobile/test/operators/test_logical_xor_op.cpp delete mode 100644 mobile/test/operators/test_lrn_op.cpp delete mode 100644 mobile/test/operators/test_mul_op.cpp delete mode 100644 mobile/test/operators/test_multiclass_nms_op.cpp delete mode 100644 mobile/test/operators/test_polygon_box_transform_op.cpp delete mode 100644 mobile/test/operators/test_pool_op.cpp delete mode 100644 mobile/test/operators/test_prelu_op.cpp delete mode 100644 mobile/test/operators/test_prior_box_op.cpp delete mode 100644 mobile/test/operators/test_quantize_op.cpp delete mode 100644 mobile/test/operators/test_relu6_op.cpp delete mode 100644 mobile/test/operators/test_relu_op.cpp delete mode 100644 mobile/test/operators/test_reshape2_op.cpp delete mode 100644 mobile/test/operators/test_reshape_op.cpp delete mode 100644 mobile/test/operators/test_resize_op.cpp delete mode 100644 mobile/test/operators/test_scale_op.cpp delete mode 100644 mobile/test/operators/test_sequence_expand_op.cpp delete mode 100644 mobile/test/operators/test_sequence_pool_op.cpp delete mode 100644 mobile/test/operators/test_sequence_softmax_op.cpp delete mode 100644 mobile/test/operators/test_sigmoid_op.cpp delete mode 100644 mobile/test/operators/test_slice_op.cpp delete mode 100644 mobile/test/operators/test_softmax_op.cpp delete mode 100644 mobile/test/operators/test_sum_op.cpp delete mode 100644 mobile/test/operators/test_tanh_op.cpp delete mode 100644 mobile/test/operators/test_topk_op.cpp delete mode 100644 mobile/test/operators/test_transpose2_op.cpp delete mode 100644 
mobile/test/operators/test_transpose_op.cpp delete mode 100644 mobile/test/test_helper.h delete mode 100644 mobile/test/test_include.h delete mode 100644 mobile/third_party/opencl/.gitinore delete mode 100644 mobile/tools/android-cmake/android.toolchain.cmake delete mode 100644 mobile/tools/android-debug-script/push2android.sh delete mode 100644 mobile/tools/android-debug-script/run_on_android.sh delete mode 100644 mobile/tools/arm-platform.cmake delete mode 100755 mobile/tools/build.sh delete mode 100755 mobile/tools/build_android_armv7.sh delete mode 100755 mobile/tools/build_android_armv8.sh delete mode 100755 mobile/tools/ci_build.sh delete mode 100644 mobile/tools/ci_run_test.sh delete mode 100644 mobile/tools/docker_build_fpga.sh delete mode 100644 mobile/tools/ios-cmake/ios.toolchain.cmake delete mode 100644 mobile/tools/net-detail.awk delete mode 100644 mobile/tools/net.awk delete mode 100755 mobile/tools/op.cmake delete mode 100644 mobile/tools/pre-commit.hooks/clang-format.hook delete mode 100755 mobile/tools/pre-commit.hooks/clang-tidy.hook delete mode 100644 mobile/tools/pre-commit.hooks/copyright.hook delete mode 100644 mobile/tools/pre-commit.hooks/cpplint.hook delete mode 100755 mobile/tools/prepare_images_and_models.sh delete mode 100644 mobile/tools/profile_show.sh delete mode 100644 mobile/tools/python/caffetools/run.py delete mode 100644 mobile/tools/python/fluidtools/.gitignore delete mode 100644 mobile/tools/python/fluidtools/run.py delete mode 100644 mobile/tools/python/fluidtools/run_multi_feed.py delete mode 100644 mobile/tools/python/fluidtools/test_wrap.py delete mode 100644 mobile/tools/python/imagetools/README.md delete mode 100644 mobile/tools/python/imagetools/imagetools.py delete mode 100644 mobile/tools/python/imagetools/img2nchw.py delete mode 100644 mobile/tools/python/imagetools/img2nhwc.py delete mode 100644 mobile/tools/python/imagetools/numpy2binary.py delete mode 100644 mobile/tools/python/misc/.gitignore delete mode 100644 
mobile/tools/python/misc/fluidtools.py delete mode 100644 mobile/tools/python/misc/ios-test-server.py delete mode 100644 mobile/tools/python/misc/restore-git.py delete mode 100644 mobile/tools/python/misc/test-fluid-op-feature.py delete mode 100644 mobile/tools/python/modeltools/.gitignore delete mode 100644 mobile/tools/python/modeltools/core/__init__.py delete mode 100644 mobile/tools/python/modeltools/core/framework.proto delete mode 100644 mobile/tools/python/modeltools/core/framework_pb2.py delete mode 100644 mobile/tools/python/modeltools/core/op_types.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/__init__.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/converter_mobilenet.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/swicher.py delete mode 100644 mobile/tools/python/modeltools/tools/__init__.py delete mode 100644 mobile/tools/python/modeltools/tools/float2halffloat.py delete mode 100644 mobile/tools/python/modeltools/tools/loader.py delete mode 100644 mobile/tools/python/modeltools/tools/model_combine.py delete mode 100644 mobile/tools/python/modeltools/tools/model_reader.py delete mode 100644 mobile/tools/python/modeltools/yolo/__init__.py delete mode 100644 mobile/tools/python/modeltools/yolo/mdl2fluid.py delete mode 100644 mobile/tools/python/modeltools/yolo/swicher.py delete mode 100644 mobile/tools/quantification/CMakeLists.txt delete mode 100644 mobile/tools/quantification/README.md delete mode 100644 mobile/tools/quantification/convert.cpp delete mode 100644 mobile/tools/quantification/scripts/run.py delete mode 100644 mobile/tools/quantification/src/block_desc_local.cpp delete mode 100644 mobile/tools/quantification/src/block_desc_local.h delete mode 100644 mobile/tools/quantification/src/enforce.h delete mode 100644 mobile/tools/quantification/src/framework.pb-c.c delete mode 100644 mobile/tools/quantification/src/framework.pb-c.h delete mode 100644 mobile/tools/quantification/src/program_desc.cpp 
delete mode 100644 mobile/tools/quantification/src/program_desc.h delete mode 100644 mobile/tools/quantification/src/protobuf-c.c delete mode 100644 mobile/tools/quantification/src/protobuf-c.h delete mode 100644 mobile/tools/quantification/src/tensor_desc.h delete mode 100644 mobile/tools/quantification/src/var_desc.h delete mode 100644 mobile/tools/quantification/tune_n_fold.py delete mode 100755 mobile/tools/shell/change_mobile_namespace.sh delete mode 100644 mobile/tools/shell/check-bitcode.sh delete mode 100644 mobile/tools/shell/check-filename.sh delete mode 100644 mobile/tools/shell/generate-include/.gitignore delete mode 100644 mobile/tools/shell/generate-include/check_include_diff.sh delete mode 100644 mobile/tools/shell/generate-include/main.cpp delete mode 100644 mobile/tools/shell/generate-include/parse.py delete mode 100755 mobile/tools/shell/generate-include/run.sh delete mode 100644 mobile/tools/shell/merge.sh delete mode 100644 mobile/tools/shell/prune_static_library.sh delete mode 100644 mobile/tools/shell/restore-private-repo.sh delete mode 100644 mobile/tools/toolchains/arm-android-neon.cmake delete mode 100644 mobile/tools/toolchains/arm-linux-gnueabi.cmake delete mode 100644 mobile/tools/toolchains/arm-linux-gnueabihf.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a28613647b..7a8f5e0a69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,12 +16,6 @@ cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(lite_utils) -lite_option(WITH_PADDLE_MOBILE "Use the paddle-mobile legacy build" OFF) -if (WITH_PADDLE_MOBILE) - add_subdirectory(mobile) - return() -endif(WITH_PADDLE_MOBILE) - set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(CMAKE_CXX_STANDARD 11) diff --git a/mobile/.clang-format b/mobile/.clang-format deleted file mode 100644 index d59e088579..0000000000 --- a/mobile/.clang-format +++ /dev/null @@ -1,5 
+0,0 @@ ---- -Language: Cpp -BasedOnStyle: Google -Standard: Cpp11 -... diff --git a/mobile/.clang-tidy b/mobile/.clang-tidy deleted file mode 100644 index c788efe69d..0000000000 --- a/mobile/.clang-tidy +++ /dev/null @@ -1,67 +0,0 @@ -Checks: > - * - -android-* - -bugprone-bool-pointer-implicit-conversion - -cert-env33-c - -cert-dcl50-cpp - -cert-dcl59-cpp - -cppcoreguidelines-* - -fuchsia-* - -google-* - google-default-arguments - google-explicit-constructor - google-runtime-member-string-references - google-runtime-operator - -hicpp-braces-around-statements - -hicpp-named-parameter - -hicpp-no-array-decay - -hicpp-no-assembler - -hicpp-no-malloc - -hicpp-function-size - -hicpp-special-member-functions - -hicpp-vararg - -llvm-* - -objc-* - -readability-else-after-return - -readability-implicit-bool-conversion - -readability-named-parameter - -readability-simplify-boolean-expr - -readability-braces-around-statements - -readability-identifier-naming - -readability-function-size - -readability-redundant-member-init - -misc-bool-pointer-implicit-conversion - -misc-definitions-in-headers - -misc-unused-alias-decls - -misc-unused-parameters - -misc-unused-using-decls - -modernize-use-using - -modernize-use-default-member-init - -clang-diagnostic-* - -clang-analyzer-* -WarningsAsErrors: '*' -HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false -FormatStyle: none -User: allonli -CheckOptions: - - key: google-readability-braces-around-statements.ShortStatementLines - value: '1' - - key: google-readability-function-size.StatementThreshold - value: '800' - - key: google-readability-namespace-comments.ShortNamespaceLines - value: '10' - - key: google-readability-namespace-comments.SpacesBeforeComments - value: '2' - - key: modernize-loop-convert.MaxCopySize - value: '16' - - key: modernize-loop-convert.MinConfidence - value: reasonable - - key: modernize-loop-convert.NamingStyle - value: CamelCase - - key: modernize-pass-by-value.IncludeStyle - value: llvm - - key: 
modernize-replace-auto-ptr.IncludeStyle - value: llvm - - key: modernize-use-nullptr.NullMacros - value: 'NULL' diff --git a/mobile/.gitignore b/mobile/.gitignore deleted file mode 100644 index 336f08fa8a..0000000000 --- a/mobile/.gitignore +++ /dev/null @@ -1,104 +0,0 @@ -opencl_kernels.cpp -# Prerequisites -*.d - -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai -*.la -*.lib -*.a - -# Executables -*.exe -*.out -*.app - -.DS_Store - -build/ - -.idea/ - -CMakeCache.txt - -CMakeFiles/ - -Makefile - -cmake_install.cmake - - -*.cbp - -paddle-mobile.cbp - -.idea - -compile_commands.json - -cmake-build-debug/ -cmake-build-release/ - -test/models/ - -test/images/ - -# Emacs intermediate files -*~ - -# CMake building directory -build - -# clion building directories -cmake-build-debug -cmake-build-release - -# ios -tools/libomp.a - -# ios demo -demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/ -demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg -demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a -*.xcuserstate -/tools/quantification/quantify - -# metal -Podfile.lock -metal/Pods/ -SwiftProtobuf.framework -paddle-mobile.xcworkspace -metal/models/ -metal/images/ -*.a -metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a -*.xcuserdatad/ -*/xcuserdata/ -/venv/ - -metal/paddle-mobile-demo/paddle-mobile-demo/images -metal/paddle-mobile-demo/paddle-mobile-demo/models -metal/paddle-mobile-demo/paddle-mobile-demo/Resources -metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images -metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models -metal/MobileNetDemo/MobileNetDemo/Resources -third_party/opencl/OpenCL-Headers diff --git a/mobile/.pre-commit-config.yaml b/mobile/.pre-commit-config.yaml deleted file mode 100644 index d9827afcd0..0000000000 --- a/mobile/.pre-commit-config.yaml +++ 
/dev/null @@ -1,69 +0,0 @@ -repos: -- repo: https://github.com/Lucas-C/pre-commit-hooks.git - sha: v1.0.1 - hooks: - - id: remove-crlf - files: ^(mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$ - exclude: ^(lite/) - - id: remove-tabs - files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$ - exclude: ^(lite/) - -- repo: https://github.com/pre-commit/pre-commit-hooks - sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0 - hooks: - - id: check-added-large-files - exclude: ^(lite/) - - id: check-merge-conflict - exclude: ^(lite/) - - id: check-symlinks - exclude: ^(lite/) - - id: detect-private-key - files: (?!.*tar.gz)^.*$ - exclude: ^(lite/) - - id: end-of-file-fixer - files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|h|hpp|hxx)$ - exclude: ^(lite/) - - id: trailing-whitespace - files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|h|hpp|hxx)$ - exclude: ^(lite/) - -- repo: local - hooks: - - id: copyright - name: copyright - entry: python ./mobile/tools/pre-commit.hooks/copyright.hook - language: system - files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx|py)$ - exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ | ^(lite/) - -- repo: local - hooks: - - id: clang-format - name: clang-format - description: Format files with ClangFormat. - entry: bash ./mobile/tools/pre-commit.hooks/clang-format.hook -i - language: system - files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx)$ - exclude: ^(lite/) - -- repo: local - hooks: - - id: cpplint - name: cpplint - description: Check C++ code style using cpplint. - entry: bash ./mobile/tools/pre-commit.hooks/cpplint.hook - language: system - files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx)$ - exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$i | *\.pb\.cpp | ^(lite/) - - -# -#- repo: local -# hooks: -# - id: clang-tidy -# name: clang-tidy -# description: Check C++ code style using clang-tidy. 
-# entry: bash ./tools/pre-commit.hooks/.clang-tidy.hook -i -# language: system -# files: (src).*\.(c|cc|cxx|cpp|h|hpp|hxx)$ diff --git a/mobile/.travis.yml b/mobile/.travis.yml deleted file mode 100644 index 20fdddd5a1..0000000000 --- a/mobile/.travis.yml +++ /dev/null @@ -1,36 +0,0 @@ -language: cpp -cache: ccache -sudo: required -dist: trusty - -os: - - linux - -addons: - apt: - packages: - - git - - python - - python-pip - - python2.7-dev - - libc6-i386 - - curl - -compiler: - - clang - -before_install: - - sudo pip install -U virtualenv pre-commit pip - # Download and install recent cmake - -script: - - | - function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } - - | - timeout 600 .travis/pre-commit-job.sh # 10min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi; - -notifications: - email: - on_success: change - on_failure: always diff --git a/mobile/.travis/pre-commit-job.sh b/mobile/.travis/pre-commit-job.sh deleted file mode 100755 index a0ae98dddd..0000000000 --- a/mobile/.travis/pre-commit-job.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -function abort(){ - echo "Your change doesn't follow Paddle-Moible's code style" 1>&2 - echo "Please use pre-commit to auto-format your code." 1>&2 - exit 1 -} - -trap 'abort' 0 -set -e -cd `dirname $0` -cd .. -export PATH=/usr/bin:$PATH -pre-commit install - -if ! 
pre-commit run -a ; then - ls -lh - git diff --exit-code - exit 1 -fi - -trap : 0 diff --git a/mobile/CMakeLists.txt b/mobile/CMakeLists.txt deleted file mode 100644 index 1883da8573..0000000000 --- a/mobile/CMakeLists.txt +++ /dev/null @@ -1,293 +0,0 @@ -cmake_minimum_required(VERSION 3.0.0) - -# basic build option -if(IS_IOS) - option(USE_OPENMP "build with openmp support" OFF) -else() - option(USE_OPENMP "build with openmp support" OFF) -endif() -option(USE_EXCEPTION "build with exception" ON) -option(WITH_LOGGING "print logging for debug" OFF) -option(WITH_SYMBOL "build with all symbols" ON) # turn off if use jni or ios io -option(WITH_PROFILE "print op profile for debug" OFF) -option(WITH_TEST "build with unit tests" ON) - -# select platform: CPU, GPU_CL, FPGA -option(CPU "build with arm CPU support" ON) -option(GPU_CL "build with OpenCL support" ON) -option(FPGA "build with FPGA support" OFF) -if(FPGA) - option(FPGAV1 "build with fpga v1 support" ON) - option(FPGAV2 "build with fpga v2 support" OFF) - option(FPGAKD "build with fpga KD support" OFF) -endif() - -project(paddle-mobile) - -# source code -file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) -file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) -include_directories(src/) - -# build flags -set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS} -Wno-attributes") -if(IS_IOS) - set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \ - -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}") - add_compile_options(-fembed-bitcode) -else() - set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") -endif() - -# others -if(USE_OPENMP) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - add_definitions(-DPADDLE_MOBILE_USE_OPENMP) -endif() - -if(WITH_LOGGING) - message(STATUS "Debugging mode") - add_definitions(-DPADDLE_MOBILE_DEBUG) -else() -endif() - -if(NOT WITH_SYMBOL) - add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) -endif() - 
-if(USE_EXCEPTION) - message(STATUS "Use exception") - add_definitions(-DENABLE_EXCEPTION -fexceptions) -else() - add_definitions(-fno-exceptions) -endif() - -if(WITH_PROFILE) - add_definitions(-DPADDLE_MOBILE_PROFILE) -endif() - -# platform control -if(ARM_LINUX) - include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake") -endif() - -if(CPU) - add_definitions(-DPADDLE_MOBILE_CPU) -else() - file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - - file(GLOB_RECURSE _tmp_list_h src/operators/kernel/arm/*.h) - foreach(f ${_tmp_list_h}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() -endif() - -if (GPU_CL) - add_definitions(-DPADDLE_MOBILE_CL) - - # opencl version - add_definitions(-DCL_TARGET_OPENCL_VERSION=220) - - if (ANDROID_ABI STREQUAL "arm64-v8a") - link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL-64.so) - else () - link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so) - endif () - - include_directories(third_party/opencl/OpenCL-Headers) -else() - file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - - file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h) - foreach(f ${_tmp_list_h}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() -endif() - -if(FPGA) - file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/math/*.cc src/operators/kernel/fpga/*.cc) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h) - foreach(f ${_tmp_list_h}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp) - list(APPEND PADDLE_MOBILE_h src/operators/math/softmax.h) - list(APPEND PADDLE_MOBILE_h src/operators/math/math_func_neon.h) - 
if(FPGAV1) - add_definitions(-DPADDLE_MOBILE_FPGA) - message("FPGA_V1 enabled") - add_definitions(-DPADDLE_MOBILE_FPGA_V1) - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.cpp src/fpga/KD/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.h src/operators/kernel/fpga/KD/*.hpp - src/fpga/KD/*.h src/fpga/KD/*.hpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - endif() - if(FPGAV2) - add_definitions(-DPADDLE_MOBILE_FPGA) - message("FPGA_V2 enabled") - add_definitions(-DPADDLE_MOBILE_FPGA_V2) - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.cpp src/fpga/KD/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.h src/operators/kernel/fpga/KD/*.hpp - src/fpga/KD/*.h src/fpga/KD/*.hpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - endif() - if(FPGAKD) - message("FPGAKD enabled") - add_definitions(-DPADDLE_MOBILE_FPGA_KD) - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list 
src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - - file(GLOB_RECURSE _tmp_list src/operators/kernel/central-arm-func/*.h) - foreach(f ${_tmp_list}) - list(APPEND PADDLE_MOBILE_H ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/central-arm-func/*.cpp) - foreach(f ${_tmp_list}) - list(APPEND PADDLE_MOBILE_CC ${f}) - endforeach() - - endif() -else() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - - file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/*.h) - foreach(f ${_tmp_list_h}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - - - file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - - file(GLOB_RECURSE _tmp_list_h src/fpga/*.h) - foreach(f ${_tmp_list_h}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() -endif() - -if(ANDROID_NDK_TOOLCHAIN_INCLUDED) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") -else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp) - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) -endif() - -if(IS_IOS) -else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC 
${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm) - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h) -endif () - -set(CMAKE_VERBOSE_MAKEFILE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) - -# NET default -if(FPGAV1) - set(NET "FPGA_NET_V1" CACHE STRING "select net type") -elseif(FPGAV2) - set(NET "FPGA_NET_V2" CACHE STRING "select net type") -elseif(FPGAKD) - set(NET "FPGA_OPS_KD" CACHE STRING "select net type") -else() - set(NET "default" CACHE STRING "select net type") -endif() - -set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGA_NET_V1" "FPGA_NET_V2" "NLP" "op") -include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake") - -# build library -if(ANDROID_NDK_TOOLCHAIN_INCLUDED) - list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS) - add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) -elseif(IS_IOS) - if(USE_OPENMP) - add_library(paddle-mobile-stage0 STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) - add_custom_target(paddle-mobile ALL - COMMAND libtool -static -o ${CMAKE_BINARY_DIR}/libpaddle-mobile.a ${CMAKE_CURRENT_LIST_DIR}/tools/libomp.a $ - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - DEPENDS paddle-mobile - ) - add_dependencies(paddle-mobile paddle-mobile-stage0) - else() - add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) - endif() -else() - add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) -endif() - -# unit test -if(WITH_TEST AND WITH_SYMBOL) - if(IS_IOS) - else() - add_subdirectory(test) - endif() -elseif(FPGA) - add_subdirectory(test) -endif() - -# # if you want to combine third party static librares into paddle mobile so, please uncomment this code block -# target_link_libraries( -# paddle-mobile -# -Wl,--whole-archive -# "path_to_third_party_static_library" -# -Wl,--no-whole-archive -# ) 
diff --git a/mobile/CONTRIBUTING.md b/mobile/CONTRIBUTING.md deleted file mode 100644 index faed8edf8e..0000000000 --- a/mobile/CONTRIBUTING.md +++ /dev/null @@ -1,234 +0,0 @@ -# 贡献代码 - -欢迎您对Paddle-Mobile项目的贡献。 -我们诚挚的感谢你的贡献,这个文档描述了我们的工作方式和工作流程。Paddle-Mobile在PaddlePaddle org下,和服务器版本的Paddle工程的代码规范基本相同,开发者也可以同时参考Paddle的相关文档。 - -## Workflow - -Paddle-Mobile 开发中使用到的几种模型在这个链接下载 [点我](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip). -之后是贡献代码的主要流程。 - -### Fork - -* Paddle-Mobile采用Pull Request的方式提交代码,禁止直接push,所有的代码都需要人工review。首先要fork一份Paddle-Moble的代码 ["Fork" button](https://help.github.com/articles/fork-a-repo/). -* 跳转到[Paddle-Mobile](https://github.com/PaddlePaddle/paddle-mobile) GitHub首页,然后单击 `Fork` 按钮,生成自己目录下的仓库,比如 。 - -### Clone(克隆) -将远程仓库 clone 到本地: - -```bash -➜ git clone https://github.com/你的用户名/paddle-mobile -➜ cd Paddle -``` - -### 创建本地分支 - -Paddle-Mobile 和Paddle一样,目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发,测试,发行和维护,具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。 - -所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成,一般从 `develop` 分支上创建新分支。 - -使用 `git checkout -b` 创建并切换到新分支。 - -```bash -➜ git checkout -b my-cool-stuff -``` - -值得注意的是,在 checkout 之前,需要保持当前分支目录 clean,否则会把 untracked 的文件也带到新分支上,这可以通过 `git status` 查看。 - -### 使用 `pre-commit` 钩子 - -Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。 - -`pre-commit`测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 Paddle,首先安装并在当前目录运行它: - -```bash -pip install pre-commit -pre-commit -v -a -``` - -Paddle-Mobile 使用 `clang-format` 来调整 C/C++ 源代码格式,在格式化代码时不同的`clang-format`版本会有不同的表现形态,和Paddle不同的是,Paddle-Mobile开发人员使用的是更的5.0版本的llvm工具集。所以为了防止无法CI,请确保 `clang-format` 版本是 5.0 版本。 - -> 另外:通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的,Paddle 开发人员使用的是`pip install pre-commit`。 - - - -## 开始开发 - -在本例中,我删除了 README.md 
中的一行,并创建了一个新文件。 - -通过 `git status` 查看当前状态,这会提示当前目录的一些变化,同时也可以通过 `git diff` 查看文件具体被修改的内容。 - -```bash -➜ git status -On branch test -Changes not staged for commit: - (use "git add ..." to update what will be committed) - (use "git checkout -- ..." to discard changes in working directory) - - modified: README.md - -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -no changes added to commit (use "git add" and/or "git commit -a") -``` - -## 构建 - -paddle-mobile是为了移动端版本开发的,而移动端大多以arm平台为主。所以我们要交叉编译到arm平台。以cpu为例: - -1. 安装NDK最新版 -2. 配置ANDROID_NDK和NDK_ROOT环境变量 -3. 开发,并写单元测试 -4. sh build.sh - -## 提交(commit) - -接下来我们取消对 README.md 文件的改变,然后提交新添加的 test 文件。 - -```bash -➜ git checkout -- README.md -➜ git status -On branch test -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -nothing added to commit but untracked files present (use "git add" to track) -➜ git add test -``` - -Git 每次提交代码,都需要写提交说明,这可以让其他人知道这次提交做了哪些改变,这可以通过`git commit` 完成。 - -```bash -▶ pre-commit run -a -v -[remove-crlf] CRLF end-lines remover........................................Passed -[remove-tabs] Tabs remover..................................................Passed -[check-added-large-files] Check for added large files.......................Passed -[check-merge-conflict] Check for merge conflicts............................Passed -[check-symlinks] Check for broken symlinks..................................Passed -[detect-private-key] Detect Private Key.....................................Passed -[end-of-file-fixer] Fix End of Files........................................Passed -[trailing-whitespace] Trim Trailing Whitespace..............................Passed -[copyright] copyright.......................................................Passed -[clang-format] clang-format.................................................Passed -[cpplint] cpplint...........................................................Passed -hookid: cpplint - -Ignoring 
build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh) -Done processing build_bak.sh -Ignoring build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh) -Done processing build_bak.sh -``` - -## 保持本地仓库最新 - -在准备发起 Pull Request 之前,需要同步原仓库()最新的代码。 - -首先通过 `git remote` 查看当前远程仓库的名字。 - -```bash -➜ git remote -origin -➜ git remote -v -origin https://github.com/USERNAME/paddle-mobile (fetch) -origin https://github.com/USERNAME/paddle-mobile (push) -``` - -这里 origin 是我们 clone 的远程仓库的名字,也就是自己用户名下的 paddle-mobile,接下来我们创建一个原始 paddle-mobile 仓库的远程主机,命名为 upstream。 - -```bash -➜ git remote add upstream https://github.com/PaddlePaddle/paddle-mobile -➜ git remote -origin -upstream -``` - -获取 upstream 的最新代码并更新当前分支。 - -```bash -➜ git fetch upstream -➜ git pull upstream develop -``` - -## Push 到远程仓库 - -将本地的修改推送到 GitHub 上,也就是 https://github.com/USERNAME/paddle-mobile。 - -```bash -# 推送到远程仓库 origin 的 my-cool-stuff 分支上 -➜ git push origin my-cool-stuff -``` - -## 建立 Issue 并完成 Pull Request - -建立一个 Issue 描述问题,并记录它的编号。 - -切换到所建分支,然后点击 `New pull request`。 - -在 PR 的描述说明中,填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后,自动关闭对应的 Issue -> 具体请见 - - -## review - -在接到PR后,可以看到该pr页面内正在运行CI。如果运行出现问题,可以点Details进入Travis平台上看详细内容。 -![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833030073.jpg) - -可以在travis上看到更加详细的信息。 -![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833651326.jpg) - -接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。 - -![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg) -之后就可以提交代码了 - -## 删除远程分支 - -在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。 - -screen shot 2017-04-26 at 9 18 24 pm - -也可以使用 `git push origin :分支名` 删除远程分支,如: - -```bash -➜ git push origin :my-cool-stuff -``` - -## 删除本地分支 - -最后,删除本地分支。 - -```bash -# 切换到 develop 分支 -➜ git checkout develop - -# 删除 my-cool-stuff 分支 -➜ git branch -D my-cool-stuff -``` - -至此,我们就完成了一次代码贡献的过程。 - -## 提交代码的一些约定 - -为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: - -1. 
请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。 -2. 提交Pull Request前: - - 请注意commit的数量: - - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 - - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。 - - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。 -3. 如果解决了某个Issue的问题,请在该Pull Request的**第一个**评论框中加上:`fix #issue_number`,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 - -此外,在回复评审人意见时,请您遵守以下约定: - -1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢): - - 对评审意见同意且按其修改完的,给个简单的`Done`即可; - - 对评审意见不同意的,请给出您自己的反驳理由。 -2. 如果评审意见比较多: - - 请给出总体的修改情况。 - - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 diff --git a/mobile/Dockerfile b/mobile/Dockerfile deleted file mode 100644 index b9fc9ed45c..0000000000 --- a/mobile/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM ubuntu:16.04 - -RUN echo '\ -deb main restricted universe multiverse\n\ -deb -updates main restricted universe multiverse\n\ -deb -backports main restricted universe multiverse\n\ -deb -security main restricted universe multiverse\n'\ -> /etc/apt/sources.list -RUN sed -ie 's||http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list -RUN sed -ie 's||xenial|' /etc/apt/sources.list - -RUN apt-get update && apt-get upgrade -y -RUN apt-get install -y --no-install-recommends \ - curl \ - unzip \ - git \ - make \ - cmake-curses-gui \ - python \ - python-pip \ - python-setuptools \ - clang-format-5.0 \ - graphviz \ - g++-arm-linux-gnueabi \ - gcc-arm-linux-gnueabi -RUN apt-get autoremove -y && apt-get clean -RUN ln -s clang-format-5.0 
/usr/bin/clang-format -RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip -RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel -RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit -RUN cd /tmp && curl -O https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip -RUN curl -O https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \ - tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \ - mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \ - mv /usr/bin/cmake /usr/bin/cmake.bak && ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \ - mv /usr/bin/ccmake /usr/bin/ccmake.bak && ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake -RUN cd /opt && unzip /tmp/android-ndk-r17c-linux-x86_64.zip -ENV NDK_ROOT /opt/android-ndk-r17c diff --git a/mobile/LICENSE b/mobile/LICENSE deleted file mode 100644 index e95626c0e4..0000000000 --- a/mobile/LICENSE +++ /dev/null @@ -1,204 +0,0 @@ -Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. 
- - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - diff --git a/mobile/README.md b/mobile/README.md deleted file mode 100644 index aa948a7ba7..0000000000 --- a/mobile/README.md +++ /dev/null @@ -1,137 +0,0 @@ -# Paddle-Mobile - -[![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc) -[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) - - - -Welcome to Paddle-Mobile GitHub project。Paddle-Mobile is a project of PaddlePaddle as well as a deep learning framework for embedded platforms. 
- -欢迎来到 Paddle-Mobile GitHub 项目。Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。 - -## Features - -- high performance in support of ARM CPU -- support Mali GPU -- support Andreno GPU -- support the realization of GPU Metal on Apple devices -- support implementation on ZU5、ZU9 and other FPGA-based development boards -- support implementation on Raspberry Pi and other arm-linux development boards - -## Features - -- 高性能支持ARM CPU -- 支持Mali GPU -- 支持Andreno GPU -- 支持苹果设备的GPU Metal实现 -- 支持ZU5、ZU9等FPGA开发板 -- 支持树莓派等arm-linux开发板 - - -## Demo -- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo) - -### 原Domo目录 - -[https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/mobile/demo](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/mobile/demo) - -## Documentation - -### Documentation of design - -If you want to know more details about the documentation of paddle-mobile design, please refer to the link as follows. There are many previous designs and discussion: [issue](https://github.com/PaddlePaddle/Paddle-Lite/issues). - -[link of documentation of design](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/design_doc.md) - -### Documentation of development - -Documentation of development is mainly about building, running and other tasks.As a developer,you can use it with the help of contributed documents. 
-* [iOS](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_ios.md) -* [Android_CPU](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_android.md) -* [Android_GPU](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_android_GPU.md) -* [FPGA](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_fpga.md) -* [ARM_LINUX](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_arm_linux.md) - -### How to contribute your documents -- [tutorial link to contribute documents](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/CONTRIBUTING.md) -- Main procedure of contributing code is covered in the document above.If you have other problems during the procedure,please send them as [issue](https://github.com/PaddlePaddle/Paddle-Lite/issues). We will deal with it as quickly as possible. - -## 文档 - -### 设计文档 - -关于paddle-mobile设计文档在下面链接中,如果想了解更多内容。[issue](https://github.com/PaddlePaddle/Paddle-Lite/issues)中会有很多早期的设计和讨论过程。 -[设计文档链接](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/design_doc.md) - -### 开发文档 - -开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。 -* [iOS](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_ios.md) -* [Android_CPU](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_android.md) -* [Android_GPU](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_android_GPU.md) -* [FPGA](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_fpga.md) -* [ARM_LINUX](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/doc/development_arm_linux.md) - -### 贡献文档 -- [贡献文档链接](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/CONTRIBUTING.md) -- 上面文档中涵盖了主要的贡献代码流程,如果在实践中您还遇到了其他问题,可以发[issue](https://github.com/PaddlePaddle/Paddle-Lite/issues)。我们看到后会尽快处理。 - -## Acquision of Models -At 
present Paddle-Mobile only supports Paddle fluid training model. Models wiil be operated regularly after transformation if you have various models. -### 1. Use Paddle Fluid directly to train -It is the most reliable method to be recommanded -### 2. Transform Caffe to Paddle Fluid model -[caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/caffe2fluid) -### 3. ONNX -ONNX is expanded as Open Neural Network Exchange. The project is aimed to make a full communication and usage among diffrent nerual network development frameworks. - -Except for directly using fluid models trained by PaddlePaddle,you can also get certain Paddle fluid models through onnx transformation. - -At present,work in support of onnx is also under operation in Baidu. Related tranformation project can be referred to here: -[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx) - -### 4. Download parts of testing models and testing pictures -[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) - -- input data generated by tools from `tools/python/imagetools`. - - -## 模型获得 -目前Paddle-Mobile仅支持Paddle fluid训练的模型。如果你手中的模型是不同种类的模型,需要进行模型转换才可以运行。 -### 1. 直接使用Paddle Fluid训练 -该方式最为可靠,推荐方式 -### 2. caffe转为Paddle Fluid模型 -[caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/caffe2fluid) -### 3. ONNX -ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切换”。该项目的目的是让不同的神经网络开发框架做到互通互用。 - -除直接使用PaddlePaddle训练fluid版本的模型外,还可以通过onnx转换得到个别Paddle fluid模型。 - -目前,百度也在做onnx支持工作。相关转换项目在这里: -[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx) - -### 4. 
部分测试模型和测试图片下载 -[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) - -- 测试输入数据可由本仓库下的脚本`tools/python/imagetools`生成。 - -## Communication -- [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc. -- QQ discussion group: 696965088 (Paddle-Mobile). -- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. - -## 交流与反馈 -- 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 -- QQ群: 696965088 (Paddle-Mobile) -- [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 - -## Old version Mobile-Deep-Learning -Original MDL(Mobile-Deep-Learning) project has been transferred to [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) - -## 旧版 Mobile-Deep-Learning -原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) - -## Copyright and License -[Apache-2.0 license](LICENSE). 
diff --git a/mobile/benchmark/arm_benchmark.md b/mobile/benchmark/arm_benchmark.md deleted file mode 100644 index aacbf3ef05..0000000000 --- a/mobile/benchmark/arm_benchmark.md +++ /dev/null @@ -1,36 +0,0 @@ -|mobilenet arm v7|1线程|2线程|4线程| -|------------|----|-----|-----| -|麒麟970(ms)|108.180|63.935|37.545| -|麒麟960(ms)|108.588|63.073|36.822| -|高通845(ms)|85.952|48.890|28.641| -|高通835(ms)|105.434|62.752|37.131| -||||| -|mobilenetssd arm v7|1线程|2线程|4线程| -|麒麟970(ms)|212.686|127.205|77.485| -|麒麟960(ms)|212.641|125.338|75.250| -|高通845(ms)|182.863|95.671|56.857| -|高通835(ms)|213.849|127.717|77.006| -||||| -|googlenet(v1) arm v7|1线程|2线程|4线程| -|麒麟970(ms)|335.288|234.559|161.295| -|麒麟960(ms)|354.443|232.642|157.815| -|高通845(ms)|282.007|173.146|122.148| -|高通835(ms)|341.250|233.354|158.554| -||||| -|squeezenet arm v7|1线程|2线程|4线程| -|麒麟970(ms)|83.726|57.944|36.923| -|麒麟960(ms)|85.835|55.762|36.496| -|高通845(ms)|71.301|41.618|28.785| -|高通835(ms)|82.407|56.176|36.455| -||||| -|yolo arm v7|1线程|2线程|4线程| -|麒麟970(ms)|129.658|79.993|49.969| -|麒麟960(ms)|130.208|78.791|48.390| -|高通845(ms)|109.244|61.736|40.600| -|高通835(ms)|130.402|80.863|50.359| - - 测试机型信息: - 麒麟970:荣耀v10 (2.36GHz * 4 + 1.8GHz * 4) - 麒麟960:华为mate9 (2.36GHz * 4 + 1.8GHz * 4) - 骁龙835:小米6 (2.45GHz * 4 + 1.9GHz * 4) - 骁龙845:OPPO FindX (2.80GHz * 4 + 1.8GHz * 4) diff --git a/mobile/benchmark/metal_benchmark.md b/mobile/benchmark/metal_benchmark.md deleted file mode 100644 index 2ffa7a00af..0000000000 --- a/mobile/benchmark/metal_benchmark.md +++ /dev/null @@ -1,10 +0,0 @@ -|mobilenetfssd|速度| -|------------|-----| -|A9(ms)|33.78| -|A10(ms)|24.05| -|A11(ms)|17.15| -||| -|genet|速度| -|A9(ms) |3.49| -|A10(ms)|2.54| -|A11(ms)|1.43| diff --git a/mobile/demo/ReadMe.md b/mobile/demo/ReadMe.md deleted file mode 100644 index c6d7b3def9..0000000000 --- a/mobile/demo/ReadMe.md +++ /dev/null @@ -1,10 +0,0 @@ -## Demo 下载路径 -- [ANDROID](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip) - -- 
[IOS](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip) - -- 原demo亦可使用getDemo.sh进行下载 - -``` -sh getDemo.sh -``` diff --git a/mobile/demo/getDemo.sh b/mobile/demo/getDemo.sh deleted file mode 100644 index 37662a2f4e..0000000000 --- a/mobile/demo/getDemo.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip -wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip -unzip paddle-mobile%2FPaddleMobile_Android.zip -unzip paddle-mobile%2FPaddleMobileDemo_iOS.zip -rm -rf paddle-mobile%2FPaddleMobile_Android.zip -rm -rf paddle-mobile%2FPaddleMobileDemo_iOS.zip -rm -rf __MACOSX diff --git a/mobile/doc/build.md b/mobile/doc/build.md deleted file mode 100644 index 0aaaccd031..0000000000 --- a/mobile/doc/build.md +++ /dev/null @@ -1,63 +0,0 @@ -# 环境搭建 -## 使用 docker -### 1. 安装 docker -安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/) -### 2. 使用 docker 搭建构建环境 -首先进入 paddle-mobile 的目录下,执行 `docker build` -以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行) -``` -$ docker build -t paddle-mobile:dev - < Dockerfile -``` -使用 `docker images` 可以看到我们新建的 image -``` -$ docker images -REPOSITORY TAG IMAGE ID CREATED SIZE -paddle-mobile dev 33b146787711 45 hours ago 372MB -``` -### 3. 使用 docker 构建 -进入 paddle-mobile 目录,执行 docker run -``` -$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev -root@5affd29d4fc5:/ # cd /paddle-mobile -### -### paddle-mobile 支持 arm 架构下的各种平台,包括 android 以及 linux 等,可以使用不同的 -### toolchain 文件生成满足需要的 makefile -### -# 生成构建 android 产出的 Makefile -root@5affd29d4fc5:/ # rm CMakeCache.txt -root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake - -# 生成构建 linux 产出的 Makefile -root@5affd29d4fc5:/ # rm CMakeCache.txt -root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake -``` -### 4. 
设置编译选项 -可以通过 ccmake 设置编译选项 -``` -root@5affd29d4fc5:/ # ccmake . - Page 1 of 1 - CMAKE_ASM_FLAGS - CMAKE_ASM_FLAGS_DEBUG - CMAKE_ASM_FLAGS_RELEASE - CMAKE_BUILD_TYPE - CMAKE_INSTALL_PREFIX /usr/local - CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake - CPU ON - DEBUGING ON - FPGA OFF - LOG_PROFILE ON - NET googlenet - USE_EXCEPTION ON - USE_OPENMP OFF -``` -修改选项后,按 `c`, `g` 更新 Makefile -### 5. 构建 -使用 make 命令进行构建 -``` -root@5affd29d4fc5:/ # make -``` -### 6. 查看构建产出 -构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行 - -## 不使用 docker -不使用 docker 的方法,可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具,可能需要设置 CC,CXX 环境变量,或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake,或者增加自己需要的 toolchain file。 diff --git a/mobile/doc/design_doc.md b/mobile/doc/design_doc.md deleted file mode 100644 index 1e23efd52c..0000000000 --- a/mobile/doc/design_doc.md +++ /dev/null @@ -1,171 +0,0 @@ -# paddle-mobile 设计文档 - - -#### 以下是 paddle-mobile 代码的执行流程图: - -![执行流程图](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/flow_chart.png) - - -#### 主要分为: Loader 模块、 Program 模块、 Executor 模块、 op 模块、 kernel 模块、scope variable Tensor 模块 - -#### 下面展开说一下各个模块的作用以及设计思路 - -### 一. Loader -先来看一下模型, 模型分为两种结构: - 一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件 - -![模型描述](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc.png) - - -另一种为参数文件结合在一起的, 如下图, 红框内为模型结构描述的 protobuf 文件, 另一个文件为结合在一起的参数文件 - -![模型描述combined](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc_combined.png) - - -loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu). -方便进行算法优化. 
- -__那么为什么融合在一起能够做算法优化 ?__ - -如果未融合的 conv add batchnorm relu 运算是这样的 - -``` -[n] -[conv_res] = conv([n]) - -for &res in conv_res { - res = add_biase(res) -} - -for &res in conv_res { - res = batchnorm(res) -} - -for &res in conv_res { - res = relu(res) -} - -``` -融合后的 conv\_add\_batchnorm\_relu 运算是这样的: - -``` -[n] -[conv_res] = conv([n]) - -for &res in conv_res { - res = relu(batchnorm(add_biase(res))) -} - -``` -由于 conv 可以转换为两个大矩阵相乘, 更进一步可以分为若干个一行一列的小矩阵相乘, 那最终的运算是这样的: - -``` -[n] -for &res in [res] { - res = relu(batchnorm(add_biase(A * B))) -} - -其中 A 和 B 为 1 * k 和 k * 1 矩阵 - -``` - - - -### 二. Program - -program 为 loader 模块的结果, 包含了优化前的模型结构对象, 以及优化后的模型结构对象, 此模块基本对应着 paddle 模型的结构, 关于paddle 模型的一些概念的定义, 详细设计可以参考 [program.md](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), 以下是一个简单的概况: - -* programDesc 中包含着若干个(googlenet mobilenet yolo squeezenet resnet 常见的模型只有一个)可以嵌套的 block, blocks中的第一个block中的某个 op 可能会执行 blocks 中后边 block 中的一系列 op 运算(只有多个block才会有此概念) -* block 包含着 ops 和 vars -* ops 为一系列 op 的描述, 描述着每个 op 的类型, 输入输出, 所需参数 -* vars 里包含的为所有 op 运算所需的参数描述 - -### 三. Executor - -executor 主要是用于 op 运算的上层调度操作, 主要有两个操作, executor 实例化 和 暴露给上层的 predict 方法 - -* executor 实例化过程中, 主要进行了这几个操作 - 1. 根据 loader 产出的 program 初始化 operator 对象 - 2. 分配所有需要用到的内存, 包括每个op 的输入输出, 权重参数, 目前模型的权重参数文件的内存格式为 NCHW, op 的输入输出中间矩阵参数也是 NCHW 格式 - 3. 调用每个 op 的 init 方法, init 方法是每个 op 实现者进行参数预处理的地方, 有助于减少 predict 的耗时 - -* predict, 主要用于拿到外部的输入, 顺序调用 op 的 run 方法进行运算, 并返回最终的结果. - - -### 四. 
op -关于 op 模块代码的详细设计可以参考 [operator部分代码设计](https://github.com/PaddlePaddle/paddle-mobile/issues/300), operator主要包含一个kernel用于运算、一个 param 用于存储属性, operator 主要有三个操作, Init、RunImp、InferShape - -* Init: Init 函数主要用于参数预处理, 如对 batchNorm 参数进行预处理, 可以将 batchNorm 运算转化为 a * x + b 形式的运算, 这个函数也会调用, kernel 的 Init 函数对 kernel 进行初始化 -* RunImp: RunImp 函数会调用自己的kernel 的 compute 方法进行运算 -* InferShape: InferShape 函数会根据输入和参数得出输出的形状, 这个函数会在 executor 实例化时, 内存初始化前调用 - -每个 operator 都需要进行注册才可以被使用, 以 conv 为例, 需在 conv_op.cpp 底部这样写: - -```c++ -// 三个平台都注册了 conv op -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv2d); -REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(conv2d); -REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); -#endif - -``` - -__一个关于包大小的优化__: - -每个 operator 都由一个宏控制编译, 如 conv_op.h(除了 conv_op.h , conv_op.cpp、conv_kernle.h、conv_kernle.cpp 也都需要加此宏控制) - -```c++ - -#ifdef CONV_OP //这个宏控制着 conv_op 是否被编译, 除了 conv_op.h , conv_op.cpp、conv_kernle.h conv_kernle.cpp 也都需要加此宏控制 - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/conv_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class ConvOp - //impl -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif - -``` -这样做的目的是为了根据不同类型的网络编译特定的op, 在 cmake 中已经配置好不同网络编译的宏, 如果你要进行编译支持 yolo 的模型, 仅需执行: - -```sh -cd toools -sh build.sh android yolo - -``` -这样只会编译 yolo 所包含的四种 op, 极大的减小了包体积和编译时间 - -### 五. kernel -kernel 为 op 的底层运算实现, 主要有两个函数, Init 和 Compute, 分别用来初始化、预处理 和 运算操作, 值得提出的是, kernel 会根据泛型特化到不同的平台, 如图所示: - -![设备特化](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/devices.png) - -不同平台的 kernel 实现, 为同一个 kernel 类不同泛型的特化实现, 目前有三个平台, arm、mali、fpga, 图中的 central-arm-func\ 目录为 op kernel 的 arm 实现, 它承担了 arm\ 目录下 kernel 的底层实现, 同时 arm 处理器作为中央处理器, central-arm-func\ 也可以作为其他协处理器的底层实现, 如: fpga 的某一个 op kernel 还没有 fpga 协处理器的实现, 就可以直接调用使用这里的 arm 实现. 
- -__如果你有兴趣新增一个协处理器实现, 就可以在次添加一个 kernel 目录, 提供协处理器实现, 如果某个 kernel 你没有实现完, 你也可以直接使用 arm 实现__ - -### 六. scope variable Tensor -* scope 用来存储管理所需用到的所有 variable(用来存储不同类型的对象, 主要是矩阵Tensor, 也就是说 scpoe 管理着 op 运算过程中所有参数矩阵, 输入输出矩阵), 可以将 scope 理解为一个 map, 这里在 map 上封了一层 scope 的概念是为了方便内存管理 -* variable 可以用来存储不同类型的对象, paddle-mobile 里主要用它来存储矩阵 Tensor -* tensor 代表着矩阵, 通过泛型可以用来存储不同类型的矩阵, 但需要注意的是, 存入和取出时的类型必须保持一致, 如果类型不一致, 使用 inline const T \*data() const 获取指针会不能通过类型检查, 通过 inline T \*mutable_data() 获取指针会重新分配内存, 以下是关于 Tensor 的一些小概念: - 1. DDim: 用来存储矩阵的维度信息. - 2. Slice(): 这个函数用来获取 N 维 (NCHW中的 N) 上切片 - 3. 当实例化未分配内存时, 调用 inline T *mutable_data() 会分配内存 diff --git a/mobile/doc/development_android.md b/mobile/doc/development_android.md deleted file mode 100644 index c7574eb55e..0000000000 --- a/mobile/doc/development_android.md +++ /dev/null @@ -1,189 +0,0 @@ -# Android开发文档 - -用户可通过如下两种方式进行编译: - -- 基于macOS 、Linux交叉编译 -- 基于Docker容器编译 - -## 基于macOS 、Linux交叉编译 - -需要: NDK17及以上、cmake 3.0及以上 - -### 执行编译 - -在paddle-mobile根目录中,执行以下命令: - -```shell - -cd tools -sh build.sh android - -# 如果想编译只支持某些特定网络的库 (可以控制包体积, 编译出来的库就只包含了支持这些特定模型的算子), 可以使用 - -sh build.sh android mobilenet googlenet - -# 当然这些网络是需要在 cmakelist 中配置的(https://github.com/PaddlePaddle/paddle-mobile/blob/73769e7d05ef4820a115ad3fb9b1ca3f55179d03/CMakeLists.txt#L216), 目前配置了几个常见模型 - -``` - -执行完毕后,生成的`so`位于`build/release/`目录中: - -- jni 头文件位于 [https://github.com/PaddlePaddle/paddle-mobile/tree/develop/src/io/jni](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/src/io/jni) -- c++ 头文件位于 [https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/paddle_inference_api.h](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/paddle_inference_api.h) - -单测可执行文件位于`test/build`目录中。 - -如果有环境问题, 可以看接下来的环节 - -### 环境配置 - -##### 下载Android NDK - -如果你的电脑安装了Android Studio, 可以在 Android Studio 中直接下载安装`NDK`或者可以在 [https://developer.android.com/ndk/](https://developer.android.com/ndk/) 这里自行下载,也可以通过以下命令获取: - -- Mac平台 - -```shell -wget 
https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip -unzip android-ndk-r17b-darwin-x86_64.zip -``` - -- Linux平台 - -```shell -wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip -unzip android-ndk-r17b-linux-x86_64.zip -``` - -##### 设置环境变量 -工程中自带的独立工具链会根据环境变量`NDK_ROOT`查找NDK,因此需要配置环境变量: - -```shell -export NDK_ROOT = "path to ndk" -``` - -##### 安装 CMake - -- Mac平台 - -mac 平台下可以使用`homebrew`安装 - -```shell -brew install cmake -``` - -- Linux平台 - -linux 下可以使用`apt-get`进行安装 - -```shell -apt-get install cmake - -``` - -##### Tips: -如果想要获得体积更小的库,可选择编译支持指定模型结构的库。 -如执行如下命令: - -```shell -sh build.sh android googlenet -``` - -会得到一个支持googlnet的体积更小的库。 - -## 基于Docker容器编译 - -### 1. 安装 docker - -安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/) - -### 2. 使用 docker 搭建构建环境 - -首先进入 paddle-mobile 的目录下,执行 `docker build` -以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行) - -```shell -$ docker build -t paddle-mobile:dev - < Dockerfile -``` -使用 `docker images` 可以看到我们新建的 image - -```shell -$ docker images -REPOSITORY TAG IMAGE ID CREATED SIZE -paddle-mobile dev 33b146787711 45 hours ago 372MB -``` -### 3. 使用 docker 构建 -进入 paddle-mobile 目录,执行 docker run - -```shell -$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev -root@5affd29d4fc5:/ # cd /paddle-mobile -# 生成构建 android 产出的 Makefile -root@5affd29d4fc5:/ # rm CMakeCache.txt -root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake -# 生成构建 linux 产出的 Makefile -root@5affd29d4fc5:/ # rm CMakeCache.txt -root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake -``` -### 4. 设置编译选项 - -可以通过 ccmake 设置编译选项 - -``` -root@5affd29d4fc5:/ # ccmake . 
- Page 1 of 1 - CMAKE_ASM_FLAGS - CMAKE_ASM_FLAGS_DEBUG - CMAKE_ASM_FLAGS_RELEASE - CMAKE_BUILD_TYPE - CMAKE_INSTALL_PREFIX /usr/local - CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake - CPU ON - DEBUGING ON - FPGA OFF - LOG_PROFILE ON - MALI_GPU OFF - NET googlenet - USE_EXCEPTION ON - USE_OPENMP OFF -``` -修改选项后,按 `c`, `g` 更新 Makefile -### 5. 构建 -使用 make 命令进行构建 - -``` -root@5affd29d4fc5:/ # make -``` -### 6. 查看构建产出 - -构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及`test/build`下,可以使用`adb`指令或`scp`传输到`device`上执行 - -## 测试 - -在编译完成后,我们提供了自动化的测试脚本,帮助用户将运行单测文件所需要的模型及库文件push到Android设备 - -执行下面的脚本,该脚本会下载测试需要的 [mobilenet和test_image_1x3x224x224_float(预处理过的 NCHW 文件) 文件](http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip),在项目下的`test`目录创建模型和图片文件夹,并将`mobilenet`复制到`paddle-mobile/test/models`目录下,将`test_image_1x3x224x224_float`复制到`paddle-mobile/test/images`目录下 - - -```shell -cd tools -sh ./prepare_images_and_models.sh -``` - -* 执行下面命令将可执行文件和预测需要的文件部署到手机 - -```shell -cd tools/android-debug-script -sh push2android.sh -``` - -* mobilenet cpu模型预测结果 - -假设mobilenet和`test_image_1x3x224x224_float`文件已经推送到手机上,执行下面命令进行mobilenet cpu的预测 - -```shell -adb shell -cd /data/local/tmp/bin/ -export LD_LIBRARY_PATH=. -./test-mobilenet -``` diff --git a/mobile/doc/development_android_GPU.md b/mobile/doc/development_android_GPU.md deleted file mode 100644 index a3fb7dd1dd..0000000000 --- a/mobile/doc/development_android_GPU.md +++ /dev/null @@ -1,77 +0,0 @@ -## paddle-mobile GPU开发文档 - -编译环境配置方法请参考`development_android.md`文档 - -1. 下载 paddle-mobile - -```shell -git clone https://github.com/PaddlePaddle/paddle-mobile.git - -adb pull /system/vendor/lib/libOpenCL.so paddle-mobile/third_party/opencl - -# 修改paddle-mobile/CMakeLists.txt文件,执行如下操作: -# option(GPU_CL "opencl gpu" OFF)->option(GPU_CL "opencl gpu" ON) - -cd paddle-mobile/tools -sh build.sh android -``` - -2. 
将单测可执行文件和模型部署到手机 - -执行下面的脚本,该脚本会下载测试需要的 [mobilenet和test_image_1x3x224x224_float(预处理过的 NCHW 文件) 文件](http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip),在项目下的`test`目录创建模型>和图片文件夹,并将`mobilenet`复制到`paddle-mobile/test/models`目录下,将`test_image_1x3x224x224_float`复制到`paddle-mobile/test/images`目录下 - -```shell -cd tools -sh ./prepare_images_and_models.sh -``` - -执行下面命令将可执行文件和预测需要的文件部署到手机 - -```shell -cd ../tools/android-debug-script -sh push2android.sh -``` - -3. 在`adb shell`中执行对应的可执行文件(目前只支持mobilenet,后续会支持更多的网络模型) - -```shell -adb shell -cd /data/local/tmp/bin/ -export LD_LIBRARY_PATH=. -./test-mobilenetgpu -``` - -4. mobilenet cpu模型预测结果 - -执行下面命令进行mobilenet cpu的预测 - -```shell -adb shell -cd /data/local/tmp/bin/ -export LD_LIBRARY_PATH=. -./test-mobilenet -``` - -5. 预测结果 - - 手机型号:小米6(CPU 835,GPU Adreno 540) - - mobilenet gpu:预测性能,耗时41ms左右。 - - mobilenet cpu: - - 1线程:108ms - 2线程:65ms - 4线程:38ms - - 手机型号:OPPO Findx(CPU 845,GPU Adreno 630) - - mobilenet gpu:预测性能,耗时27ms左右。 - - mobilenet cpu: - - 1线程:90ms - 2线程:50ms - 4线程:29ms - - 备注: GPU 在打开log之后, 会大幅增加性能开销,测试benchmark请关闭CmakeList中Log选项 diff --git a/mobile/doc/development_arm_linux.md b/mobile/doc/development_arm_linux.md deleted file mode 100644 index bdabd04223..0000000000 --- a/mobile/doc/development_arm_linux.md +++ /dev/null @@ -1,62 +0,0 @@ -# ARM Linux开发文档 - -在ARM Linux如Raspberrypi3,或Firefly-RK3399上编译paddle-mobile(**注:暂不支持ARM Linux GPU**)。 - -## 预先安装 - -```shell -$ sudo apt update -$ sudo apt-get install -y cmake git -$ git clone https://github.com/PaddlePaddle/paddle-mobile.git -``` - -## 编译 - -在paddle-mobile根目录中,执行以下命令: - -```shell -# 进入paddle-mobile根目录 -$ cd - -# 可选:开启GPU支持,在CMakeLists.txt开启GPU_CL选项为ON -$ cp /usr/lib/aarch64-linux-gnu/libMali.so ./third_party/opencl/ -$ cp /usr/lib/aarch64-linux-gnu/libOpenCL.so ./third_party/opencl/ -$ ln -s ./third_party/opencl/libMali.so ./third_party/opencl/ - -# 编译 -$ cd ./tools -$ /bin/bash build.sh arm_linux -``` - -- 动态库`so`文件位于`/build/release/arm-linux/build`目录; 
-- 单元测试位于`/test/build`目录,若只编译如`googlenet`,可以执行`bash build.sh arm_linux googlenet`。 - -## 运行 - -接着刚刚的命令,执行MobileNet模型: - -```shell -# 导入编译好的动态库路径到LD_LIBRARY_PATH中 -$ cd ../build/release/arm-linux/build -$ export LD_LIBRARY_PATH=. - -# 执行MobileNet -# 可选:GPU执行./test-mobilenetgpu -$ cd ../../../../test/build/ -$ ./test-mobilenet - -# 执行顺利会打印如下日志 -load cost :0ms - Max element is 0.985921 at position 954 -predict cost :121.462ms -如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana 是否存在? -``` - -注意: -1. 如果本地仓库中`test`目录下没有模型,脚本会自动下载官方demo模型并解压; -2. 因为ARM Linux设备算力限制,编译卡死重启机器尝试单线程编译(修改`tools/build.sh`中`build_for_arm_linux`的编译为`make -j`),或指定编译某个模型(如googlenet)或扩大系统的swap交换空间。 - -## 其它 - -- 若编译中提示有不识别的编译选项等ARM Linux平台的编译问题,可尝试修改`tools/build.sh`中的相关编译参数; -- Android平台请参考Android开发文档. diff --git a/mobile/doc/development_fpga.md b/mobile/doc/development_fpga.md deleted file mode 100644 index 4019739b45..0000000000 --- a/mobile/doc/development_fpga.md +++ /dev/null @@ -1,5 +0,0 @@ -# FPGA开发文档 - -FPGA平台的代码分为V1和V2。要复现V1运行的结果,需要准备专门的硬件、底层驱动程序、FPGA工程。这些都在之前的版本[1.1.1](https://github.com/PaddlePaddle/paddle-mobile/releases/tag/1.1.1) 中提供了链接。根据链接的使用说明,可以复现resnet50的推测结果。 - -后续PaddleMobile版本,不再提供相关的辅助文件。 diff --git a/mobile/doc/development_ios.md b/mobile/doc/development_ios.md deleted file mode 100644 index 1dbc7555e8..0000000000 --- a/mobile/doc/development_ios.md +++ /dev/null @@ -1,85 +0,0 @@ -# iOS开发文档 - -## CPU - -需要: xcode - -### 编译 - -```sh - -# 在 paddle-mobile 目录下: -cd tools - -sh build.sh ios - -# 如果只想编译某个特定模型的 op, 则需执行以下命令 -sh build.sh ios googlenet - -# 在这个文件夹下, 你可以拿到生成的 .a 库 -cd ../build/release/ios/build - -``` -#### 常见问题: - -1. No iOS SDK's found in default search path ... 
- - 这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定, - 以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk") - -### 集成 - -``` -将上一步生成的: -libpaddle-mobile.a - -/src/ios_io/ 下的 -PaddleMobileCPU.h -``` -拖入工程 - -#### oc 接口 - -接口如下: - -``` -/* - 创建对象 -*/ -- (instancetype)init; - -/* - load 模型, 开辟内存 -*/ -- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; - -/* - 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict -*/ -- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; - -/* - 进行预测 -*/ -- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim; - -/* - 清理内存 -*/ -- (void)clear; - -``` - -## GPU - -需要: xcode、cocoapods - -``` -# 在 paddle-mobile 目录下: -cd metal - -pod install - -open paddle-mobile.xcworkspace - -``` diff --git a/mobile/doc/quantification.md b/mobile/doc/quantification.md deleted file mode 100644 index 4e851581ae..0000000000 --- a/mobile/doc/quantification.md +++ /dev/null @@ -1,33 +0,0 @@ -# Quantification 模型量化、反量化 - -## 背景故事 -部分网络如AlexNet训练出的模型体积较大,不适宜在移动设备上使用。 - - -## 解决模型过大办法 -1. 选用适合移动端的模型结构如:mobilenet、googlenet、 yolo、squeezenet 等; -2. 
使用我们提供的量化工具,可以在几乎不影响精度的情况下将float32模型减小至原模型的 1/4; - -- - - - - -## 量化工具介绍 - -### 模型转化工具目录: - -- [量化工具目录](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification) - -- [模型转化工具](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp) - -#### 使用说明 -- [工具使用](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md) - -## 如何读取量化后的模型 -load方法中添加了 quantification 参数,默认为false。 如果需要load量化后的模型,按需传参即可。 - -[我是源代码](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h) - -```c++ -bool Load(const std::string &dirname, bool optimize = false, - bool quantification = false, int batch_size = 1); -``` - -- - - - - diff --git a/mobile/src/common/common.h b/mobile/src/common/common.h deleted file mode 100644 index c7a681f426..0000000000 --- a/mobile/src/common/common.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include // NOLINT - -namespace paddle_mobile { - -using Time = decltype(std::chrono::high_resolution_clock::now()); - -inline Time time() { return std::chrono::high_resolution_clock::now(); } - -inline double time_diff(Time t1, Time t2) { - typedef std::chrono::microseconds ms; - auto diff = t2 - t1; - ms counter = std::chrono::duration_cast(diff); - return counter.count() / 1000.0; -} - -} // namespace paddle_mobile diff --git a/mobile/src/common/enforce.h b/mobile/src/common/enforce.h deleted file mode 100644 index 9cabee989b..0000000000 --- a/mobile/src/common/enforce.h +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef ENABLE_EXCEPTION -#include -#include -#include -#include -#endif - -namespace paddle_mobile { - -#ifdef ENABLE_EXCEPTION -struct PaddleMobileException : public std::exception { - const std::string exception_prefix = "paddle mobile C++ Exception: \n"; - std::string message; - - PaddleMobileException(const char *header, const char *detail, - const char *file, const int line) { - char buffer[1500]; - snprintf(buffer, sizeof(buffer), - "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n", - exception_prefix.c_str(), header, file, line, detail); - message = std::string(buffer); - } - const char *what() const noexcept { return message.c_str(); } -}; - -#define PADDLE_MOBILE_THROW_EXCEPTION(...) 
\ - { \ - char buffer[1000]; \ - snprintf(buffer, sizeof(buffer), __VA_ARGS__); \ - throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \ - __FILE__, __LINE__); \ - } \ - exit(0); - -#define PADDLE_MOBILE_ENFORCE(stat, ...) \ - { \ - if (stat) { \ - } else { \ - char buffer[1000]; \ - snprintf(buffer, sizeof(buffer), __VA_ARGS__); \ - throw paddle_mobile::PaddleMobileException("paddle-mobile enforce", \ - buffer, __FILE__, __LINE__); \ - } \ - } -#else -#define PADDLE_MOBILE_THROW_EXCEPTION(...) - -#define PADDLE_MOBILE_ENFORCE(stat, ...) \ - { \ - if (stat) { \ - } else { \ - } \ - } - -#endif - -} // namespace paddle_mobile diff --git a/mobile/src/common/log.h b/mobile/src/common/log.h deleted file mode 100644 index 3b42188b62..0000000000 --- a/mobile/src/common/log.h +++ /dev/null @@ -1,283 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#ifdef PADDLE_MOBILE_DEBUG -#include -#include -#include -#include -#endif -#ifdef ANDROID -#include -#endif - -namespace paddle_mobile { - -#ifdef PADDLE_MOBILE_DEBUG - -#ifdef ANDROID - -static const char *ANDROID_LOG_TAG = - "paddle_mobile LOG built on " __DATE__ " " __TIME__; -#ifdef PADDLE_ENABLE_COLORABLE_LOG -#define PADDLE_RED "\033[1;31;40m" -#define PADDLE_GREEN "\033[1;32;40m" -#define PADDLE_YELLOW "\033[1;33;40m" -#define PADDLE_LIGHT_RED "\033[1;35;40m" -#define PADDLE_BLUE "\033[1;34;40m" -#define PADDLE_WHITE "\033[1;37;40m" -#define PADDLE_CONON "\033[0m" -#else -#define PADDLE_RED "" -#define PADDLE_GREEN "" -#define PADDLE_YELLOW "" -#define PADDLE_LIGHT_RED "" -#define PADDLE_BLUE "" -#define PADDLE_WHITE "" -#define PADDLE_CONON "" -#endif -#define ANDROIDLOGI(...) \ - __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \ - fprintf(stderr, PADDLE_YELLOW "%s\n" PADDLE_CONON, __VA_ARGS__); \ - fflush(stderr) -#define ANDROIDLOGW(...) \ - __android_log_print(ANDROID_LOG_WARN, ANDROID_LOG_TAG, __VA_ARGS__); \ - fprintf(stderr, PADDLE_LIGHT_RED "%s\n" PADDLE_CONON, __VA_ARGS__); \ - fflush(stderr) -#define ANDROIDLOGD(...) \ - __android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__); \ - fprintf(stderr, PADDLE_WHITE "%s\n" PADDLE_CONON, __VA_ARGS__); \ - fflush(stderr) -#define ANDROIDLOGE(...) \ - __android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__); \ - fprintf(stderr, PADDLE_RED "%s\n" PADDLE_CONON, __VA_ARGS__); \ - fflush(stderr) -#define ANDROIDLOGV(...) \ - __android_log_print(ANDROID_LOG_VERBOSE, ANDROID_LOG_TAG, __VA_ARGS__); \ - fprintf(stderr, PADDLE_GREEN "%s\n" PADDLE_CONON, __VA_ARGS__); \ - fflush(stderr) -#else -#define ANDROIDLOGI(...) -#define ANDROIDLOGW(...) -#define ANDROIDLOGD(...) -#define ANDROIDLOGE(...) -#define ANDROIDLOGV(...) 
- -#endif - -enum LogLevel { - kLOG_ERROR, - kLOG_WARNING, - kLOG_INFO, - kLOG_VERBOSE, - kLOG_DEBUG, - kLOG_DEBUG1, - kLOG_DEBUG2, - kLOG_DEBUG3, - kLOG_DEBUG4, - kNO_LOG, -}; - -// log level -static LogLevel log_level = kLOG_DEBUG4; - -static std::vector logs{"ERROR ", "WARNING", "INFO ", "VERBOSE", - "DEBUG ", "DEBUG1 ", "DEBUG2 ", "DEBUG3 ", - "DEBUG4 ", "NO "}; -struct ToLog; -struct Print; - -struct Print { - friend struct ToLog; - - template - Print &operator<<(T const &value) { - buffer_ << value; - return *this; - } - - private: - void print(LogLevel level) { - // buffer_ << std::endl; - if (level == kLOG_ERROR) { -#ifdef ANDROID - ANDROIDLOGE(buffer_.str().c_str()); -#else - std::cerr << buffer_.str() << std::endl; -#endif - } else if (level == kLOG_INFO) { -#ifdef ANDROID - ANDROIDLOGI(buffer_.str().c_str()); -#else - std::cerr << buffer_.str() << std::endl; -#endif - } else if (level == kLOG_VERBOSE) { -#ifdef ANDROID - ANDROIDLOGV(buffer_.str().c_str()); -#else - std::cerr << buffer_.str() << std::endl; -#endif - } else if (level == kLOG_WARNING) { -#ifdef ANDROID - ANDROIDLOGW(buffer_.str().c_str()); -#else - std::cerr << buffer_.str() << std::endl; -#endif - } else { -#ifdef ANDROID - ANDROIDLOGD(buffer_.str().c_str()); -#else - std::cout << buffer_.str() << std::endl; -#endif - } - } - std::ostringstream buffer_; -}; - -struct ToLog { - explicit ToLog(LogLevel level = kLOG_DEBUG, const std::string &info = "") - : level_(level) { - unsigned blanks = - (unsigned)(level > kLOG_DEBUG ? 
(level - kLOG_DEBUG) * 4 : 1); - printer_ << logs[level] << " " << info << ":" << std::string(blanks, ' '); - } - - template - ToLog &operator<<(T const &value) { - printer_ << value; - return *this; - } - - ~ToLog() { printer_.print(level_); } - - private: - LogLevel level_; - Print printer_; -}; - -#define LOG(level) \ - if (level > paddle_mobile::log_level) { \ - /* NOLINTNEXTLINE */ \ - } else \ - paddle_mobile::ToLog( \ - level, static_cast( \ - std::stringstream() \ - << "[file: " \ - << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \ - : __FILE__) \ - << "] [line: " << __LINE__ << "] ") \ - .str()) - -#define DLOG \ - if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ - /* NOLINTNEXTLINE */ \ - } else \ - paddle_mobile::ToLog( \ - paddle_mobile::kLOG_DEBUG, \ - static_cast( \ - std::stringstream() \ - << "[file: " \ - << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \ - : __FILE__) \ - << "] [line: " << __LINE__ << "] ") \ - .str()) - -#define LOGF(level, format, ...) \ - if (level > paddle_mobile::log_level) { \ - /* NOLINTNEXTLINE */ \ - } else \ - printf(format, ##__VA_ARGS__) - -#define DLOGF(format, ...) \ - if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ - /* NOLINTNEXTLINE */ \ - } else \ - printf(format, ##__VA_ARGS__) - -#else - -#define ANDROIDLOGI(...) -#define ANDROIDLOGW(...) -#define ANDROIDLOGD(...) -#define ANDROIDLOGE(...) -#define ANDROIDLOGV(...) 
- -enum LogLevel { - kLOG_ERROR, - kLOG_WARNING, - kLOG_INFO, - kLOG_VERBOSE, - kLOG_DEBUG, - kLOG_DEBUG1, - kLOG_DEBUG2, - kLOG_DEBUG3, - kLOG_DEBUG4, - kNO_LOG -}; - -struct ToLog; -struct Print { - friend struct ToLog; - template - Print &operator<<(T const &value) { - return *this; - } -}; - -struct ToLog { - explicit ToLog(LogLevel level) {} - - template - ToLog &operator<<(T const &value) { - return *this; - } -}; - -#define LOG(level) \ - if (true) { \ - /* NOLINTNEXTLINE */ \ - } else \ - paddle_mobile::ToLog(level) - -#define DLOG \ - if (true) { \ - /* NOLINTNEXTLINE */ \ - } else \ - paddle_mobile::ToLog(paddle_mobile::kLOG_DEBUG) - -#define LOGF(level, format, ...) - -#define DLOGF(format, ...) - -#endif - -template -Print &operator<<(Print &printer, const std::vector &v) { - printer << "[ "; - - for (int i = 0; i < v.size(); ++i) { - const auto &value = v[i]; - printer << value << " "; - if (i % 10 == 9) { - printer << "\n"; - } - } - printer << " ]"; - return printer; -} - -} // namespace paddle_mobile diff --git a/mobile/src/common/threadpool.h b/mobile/src/common/threadpool.h deleted file mode 100644 index bf7894dd94..0000000000 --- a/mobile/src/common/threadpool.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace paddle_mobile { -class ThreadPool { - public: - static ThreadPool& getThreadPool(); - static int getThreadPoolThreadId(); - explicit ThreadPool(size_t); - template - auto enqueue(F&& f, Args&&... args) - -> std::future::type>; - ~ThreadPool(); - int getTid(const std::thread::id& id) { - for (int i = 0; i < workers.size(); i++) { - if (workers[i].get_id() == id) { - return i; - } - } - return -1; - } - - private: - // need to keep track of threads so we can join them - std::vector workers; - // the task queue - std::queue> tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// the constructor just launches some amount of workers -inline ThreadPool::ThreadPool(size_t threads) : stop(false) { - for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this] { - for (;;) { - std::function task; - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait( - lock, [this] { return this->stop || !this->tasks.empty(); }); - // for (;;) { - // if (this->stop || !this->tasks.empty()) { - // break; - // } - // lock.unlock(); - // lock.lock(); - // } - if (this->stop && this->tasks.empty()) return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - }); -} - -// add new work item to the pool -template -auto ThreadPool::enqueue(F&& f, Args&&... 
args) - -> std::future::type> { - using return_type = typename std::result_of::type; - - auto task = std::make_shared>( - std::bind(std::forward(f), std::forward(args)...)); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - // if(stop) - // throw std::runtime_error("enqueue on stopped ThreadPool"); - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; -} - -// the destructor joins all threads -inline ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread& worker : workers) worker.join(); -} - -ThreadPool& ThreadPool::getThreadPool() { - static ThreadPool threadPool(3); - return threadPool; -} - -int ThreadPool::getThreadPoolThreadId() { - return getThreadPool().getTid(std::this_thread::get_id()); -} -} // namespace paddle_mobile diff --git a/mobile/src/common/type_define.h b/mobile/src/common/type_define.h deleted file mode 100644 index bedbd2a75e..0000000000 --- a/mobile/src/common/type_define.h +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -namespace paddle_mobile { - -typedef enum { - _void = 0, - _float, - _int, - _uint16_t, - _double, - _int64_t, - _size_t, - _int16_t, - _int8_t, - _uint8_t, - _bool, - _string, - _floats = 100, - _ints, - _int64_ts, - _size_ts, - _bools, - _strings, - _const_float = 200, - _const_int, - _block = 300, - _tensor, - _lod_tensor, - _blocks, - _tensors, - _lod_tensors, - _p_block = 400, - _p_tensor, - _p_lod_tensor, - _p_blocks, - _p_tensors, - _p_lod_tensors, - _scopes = 500, - _selected_rows, - _dim0 = 600, - _dim1, - _dim2, - _dim3, - _dim4, - _dim5, - _dim6, - _dim7, - _dim8, - _dim9, -#ifdef PADDLE_MOBILE_CL - _cl_image, -#endif -} kTypeId_t; - -template -struct TypeIdWrapper { - inline std::string name(); - inline kTypeId_t hash_code(); -}; - -template -struct type_id { - const kTypeId_t hash_code() const { return TypeIdWrapper().hash_code(); } - const std::string name() const { return TypeIdWrapper().name(); } - - template - bool operator==(const type_id &operand) const { - return this->hash_code() == operand.hash_code(); - } -}; - -#define OVERIDE_TYPEID_OPERATOR(oprand) \ - template \ - inline bool operator oprand(const kTypeId_t &t0, const type_id &t1) { \ - return t0 oprand t1.hash_code(); \ - } \ - template \ - inline bool operator oprand(const type_id &t0, const kTypeId_t &t1) { \ - return t1 oprand t0.hash_code(); \ - } - -OVERIDE_TYPEID_OPERATOR(==) -OVERIDE_TYPEID_OPERATOR(!=) - -namespace framework { -class BlockDesc; -class Tensor; -class LoDTensor; -class SelectedRows; -class Scope; -#ifdef PADDLE_MOBILE_CL -class CLImage; -#endif - -template -struct Dim; -} // namespace framework - -#define REGISTER_TYPE_ID(Type, TypeName) \ - template <> \ - struct TypeIdWrapper { \ - inline std::string name() { return std::string(#TypeName); } \ - inline kTypeId_t hash_code() { return kTypeId_t::TypeName; } \ - }; - -REGISTER_TYPE_ID(void, _void) -REGISTER_TYPE_ID(float, _float) -REGISTER_TYPE_ID(int, _int) 
-REGISTER_TYPE_ID(uint16_t, _uint16_t) -REGISTER_TYPE_ID(double, _double) -REGISTER_TYPE_ID(int64_t, _int64_t) -REGISTER_TYPE_ID(size_t, _size_t) -REGISTER_TYPE_ID(int16_t, _int16_t) -REGISTER_TYPE_ID(int8_t, _int8_t) -REGISTER_TYPE_ID(uint8_t, _uint8_t) -REGISTER_TYPE_ID(bool, _bool) -REGISTER_TYPE_ID(std::string, _string) -REGISTER_TYPE_ID(std::vector, _floats) -REGISTER_TYPE_ID(std::vector, _ints) -REGISTER_TYPE_ID(std::vector, _int64_ts) -REGISTER_TYPE_ID(std::vector, _size_ts) -REGISTER_TYPE_ID(std::vector, _bools) -REGISTER_TYPE_ID(std::vector, _strings) - -REGISTER_TYPE_ID(float const, _const_float) -REGISTER_TYPE_ID(int const, _const_int) - -REGISTER_TYPE_ID(framework::BlockDesc, _block) -REGISTER_TYPE_ID(framework::Tensor, _tensor) -REGISTER_TYPE_ID(framework::LoDTensor, _lod_tensor) -REGISTER_TYPE_ID(std::vector, _blocks) -REGISTER_TYPE_ID(std::vector, _tensors) -REGISTER_TYPE_ID(std::vector, _lod_tensors) - -REGISTER_TYPE_ID(framework::BlockDesc *, _p_block) -REGISTER_TYPE_ID(framework::Tensor *, _p_tensor) -REGISTER_TYPE_ID(framework::LoDTensor *, _p_lod_tensor) -REGISTER_TYPE_ID(std::vector, _p_blocks) -REGISTER_TYPE_ID(std::vector, _p_tensors) -REGISTER_TYPE_ID(std::vector, _p_lod_tensors) - -REGISTER_TYPE_ID(std::vector, _scopes); -REGISTER_TYPE_ID(framework::SelectedRows, _selected_rows) -REGISTER_TYPE_ID(framework::Dim<0>, _dim0) -REGISTER_TYPE_ID(framework::Dim<1>, _dim1) -REGISTER_TYPE_ID(framework::Dim<2>, _dim2) -REGISTER_TYPE_ID(framework::Dim<3>, _dim3) -REGISTER_TYPE_ID(framework::Dim<4>, _dim4) -REGISTER_TYPE_ID(framework::Dim<5>, _dim5) -REGISTER_TYPE_ID(framework::Dim<6>, _dim6) -REGISTER_TYPE_ID(framework::Dim<7>, _dim7) -REGISTER_TYPE_ID(framework::Dim<8>, _dim8) -REGISTER_TYPE_ID(framework::Dim<9>, _dim9) - -#ifdef PADDLE_MOBILE_CL -REGISTER_TYPE_ID(framework::CLImage, _cl_image) -#endif -} // namespace paddle_mobile - -namespace std { - -template <> -struct hash { - size_t operator()(const paddle_mobile::kTypeId_t &t) const { - return 
std::hash{}(static_cast(t)); - } -}; - -} // namespace std diff --git a/mobile/src/common/types.cpp b/mobile/src/common/types.cpp deleted file mode 100755 index 00a4369010..0000000000 --- a/mobile/src/common/types.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "common/types.h" -#include - -namespace paddle_mobile { - -const char *G_OP_TYPE_CONV = "conv2d"; -const char *G_OP_TYPE_BATCHNORM = "batch_norm"; -const char *G_OP_TYPE_INSTANCENORM = "instance_norm"; -const char *G_OP_TYPE_BOX_CODER = "box_coder"; -const char *G_OP_TYPE_CONCAT = "concat"; -const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; -const char *G_OP_TYPE_ELEMENTWISE_SUB = "elementwise_sub"; -const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul"; -const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant"; -const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; -const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu"; -const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu"; -const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu"; -const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu"; -const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu"; -const char *G_OP_TYPE_FUSION_CONV_RELU = "fusion_conv_relu"; -const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu"; -const char *G_OP_TYPE_FC = 
"fusion_fc"; -const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; -const char *G_OP_TYPE_LRN = "lrn"; -const char *G_OP_TYPE_MUL = "mul"; -const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; -const char *G_OP_TYPE_NORM = "norm"; -const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform"; -const char *G_OP_TYPE_POOL2D = "pool2d"; -const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; -const char *G_OP_TYPE_DENSITY_PRIOR_BOX = "density_prior_box"; -const char *G_OP_TYPE_RELU = "relu"; -const char *G_OP_TYPE_RELU6 = "relu6"; -const char *G_OP_TYPE_LEAKY_RELU = "leaky_relu"; -const char *G_OP_TYPE_RESHAPE = "reshape"; -const char *G_OP_TYPE_RESHAPE2 = "reshape2"; -const char *G_OP_TYPE_SCALE = "scale"; -const char *G_OP_TYPE_SIGMOID = "sigmoid"; -const char *G_OP_TYPE_SOFTMAX = "softmax"; -const char *G_OP_TYPE_TRANSPOSE = "transpose"; -const char *G_OP_TYPE_TRANSPOSE2 = "transpose2"; -const char *G_OP_TYPE_SPLIT = "split"; -const char *G_OP_TYPE_FEED = "feed"; -const char *G_OP_TYPE_FETCH = "fetch"; -const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; -const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence"; -const char *G_OP_TYPE_DROPOUT = "dropout"; -const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn"; -const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn"; -const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU = - "fusion_elementwise_add_relu"; -const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu"; -const char *G_OP_TYPE_REGION = "region"; -const char *G_OP_TYPE_FUSION_CONV_BN = "fusion_conv_bn"; -const char *G_OP_TYPE_CONV_TRANSPOSE = "conv2d_transpose"; -const char *G_OP_TYPE_PRELU = "prelu"; -const char *G_OP_TYPE_LOOKUP_TABLE = "lookup_table"; -const char *G_OP_TYPE_GRU = "gru"; -const char *G_OP_TYPE_GRU_UNIT = "gru_unit"; -const char *G_OP_TYPE_CRF = "crf_decoding"; -const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp"; -const char *G_OP_TYPE_NEAREST_INTERP = "nearest_interp"; -const char *G_OP_TYPE_FLATTEN = "flatten"; 
-const char *G_OP_TYPE_FLATTEN2 = "flatten2"; -const char *G_OP_TYPE_SHAPE = "shape"; -const char *G_OP_TYPE_SUM = "sum"; -const char *G_OP_TYPE_TOP_K = "top_k"; -const char *G_OP_TYPE_CAST = "cast"; -const char *G_OP_TYPE_LOG = "log"; -const char *G_OP_TYPE_LOD_RESET = "lod_reset"; -const char *G_OP_TYPE_LESS_THAN = "less_than"; -const char *G_OP_TYPE_LOGICAL_AND = "logical_and"; -const char *G_OP_TYPE_LOGICAL_OR = "logical_or"; -const char *G_OP_TYPE_LOGICAL_NOT = "logical_not"; -const char *G_OP_TYPE_LOGICAL_XOR = "logical_xor"; -const char *G_OP_TYPE_WRITE_TO_ARRAY = "write_to_array"; -const char *G_OP_TYPE_READ_FROM_ARRAY = "read_from_array"; -const char *G_OP_TYPE_IS_EMPTY = "is_empty"; -const char *G_OP_TYPE_INCREMENT = "increment"; -const char *G_OP_TYPE_EXP = "exp"; - -const char *G_OP_TYPE_QUANTIZE = "quantize"; -const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; -const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn"; -const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn"; -const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu"; -const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu"; -const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT = - "fusion_dequant_add_bn_quant"; -const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT = - "fusion_dequant_add_bn_relu_quant"; - -const char *G_OP_TYPE_TANH = "tanh"; -const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; -const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add"; -const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu"; - -const char *G_OP_TYPE_SEQUENCE_EXPAND = "sequence_expand"; -const char *G_OP_TYPE_SEQUENCE_POOL = "sequence_pool"; -const char *G_OP_TYPE_SEQUENCE_SOFTMAX = "sequence_softmax"; -const char *G_OP_TYPE_SLICE = "slice"; -const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator"; -const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals"; -const char *G_OP_TYPE_PSROI_POOL = "psroi_pool"; 
-const char *G_OP_TYPE_ROIALIGN_POOL = "roialign_pool"; -const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform"; -const char *G_OP_TYPE_PAD2D = "pad2d"; -const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu"; -const char *G_OP_TYPE_FUSION_DECONV_ADD_BN = "fusion_deconv_add_bn"; -const char *G_OP_TYPE_FUSION_DECONV_BN_RELU = "fusion_deconv_bn_relu"; -const char *G_OP_TYPE_ASSIGN = "assign"; -const char *G_OP_TYPE_REDUCE_PROD = "reduce_prod"; -const char *G_OP_TYPE_EQUAL = "equal"; -const char *G_OP_TYPE_CONDITIONAL_BLOCK = "conditional_block"; -const char *G_OP_TYPE_RANGE = "range"; -const char *G_OP_TYPE_WHILE = "while"; -const char *G_OP_TYPE_BEAM_SEARCH_DECODE = "beam_search_decode"; -const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE = - "fill_constant_batch_size_like"; -const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu"; -const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle"; -const char *G_OP_TYPE_EXPAND = "expand"; -const char *G_OP_TYPE_GRID_SAMPLER = "grid_sampler"; - -std::unordered_map< - std::string, std::pair, std::vector>> - op_input_output_key = { - {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, - {G_OP_TYPE_RELU6, {{"X"}, {"Out"}}}, - {G_OP_TYPE_LEAKY_RELU, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SCALE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}}, - {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, - {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_ELEMENTWISE_SUB, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, - {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, - 
{G_OP_TYPE_INSTANCENORM, {{"X"}, {"Y"}}}, - {G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}}, - {G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, - {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, - {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_TRANSPOSE2, {{"X"}, {"Out", "XShape"}}}, - {G_OP_TYPE_BOX_CODER, - {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, - {G_OP_TYPE_DENSITY_PRIOR_BOX, - {{"Image", "Input"}, {"Boxes", "Variances"}}}, - {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, - {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, - {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, - {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_EXP, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Y"}}}, - {G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Y"}}}, - {G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}}, - {G_OP_TYPE_REGION, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FUSION_CONV_BN, {{"Input"}, {"Y"}}}, - {G_OP_TYPE_LOOKUP_TABLE, {{"W", "Ids"}, {"Out"}}}, - {G_OP_TYPE_GRU, - {{"Input", "H0", "Weight", "Bias"}, - {"BatchGate", "BatchResetHiddenPrev", "BatchHidden", "Hidden"}}}, - {G_OP_TYPE_GRU_UNIT, - {{"Input", "HiddenPrev", "Weight", "Bias"}, - {"Gate", 
"ResetHiddenPrev", "Hidden"}}}, - {G_OP_TYPE_CRF, {{"Emission", "Transition", "Label"}, {"ViterbiPath"}}}, - {G_OP_TYPE_BILINEAR_INTERP, {{"OutSize", "X"}, {"Out"}}}, - {G_OP_TYPE_NEAREST_INTERP, {{"OutSize", "X"}, {"Out"}}}, - {G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FLATTEN2, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_SUM, {{"X"}, {"Out"}}}, - {G_OP_TYPE_TOP_K, {{"X"}, {"Out", "Indices"}}}, - {G_OP_TYPE_CAST, {{"X"}, {"Out"}}}, - {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, - {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT, - {{"X", "Scale"}, {"Out", "OutScale"}}}, - {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT, - {{"X", "Scale"}, {"Out", "OutScale"}}}, - {G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DECONV_ADD_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_SEQUENCE_EXPAND, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_SEQUENCE_POOL, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SEQUENCE_SOFTMAX, {{"X"}, {"Out"}}}, - {G_OP_TYPE_NORM, {{"X"}, {"Out", "Norm"}}}, - {G_OP_TYPE_LOG, {{"X"}, {"Out"}}}, - {G_OP_TYPE_LOD_RESET, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_LESS_THAN, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_LOGICAL_AND, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_LOGICAL_OR, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_LOGICAL_XOR, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_LOGICAL_NOT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_WRITE_TO_ARRAY, {{"X", "I"}, {"Out"}}}, - {G_OP_TYPE_READ_FROM_ARRAY, {{"X", "I"}, {"Out"}}}, - {G_OP_TYPE_IS_EMPTY, {{"X"}, {"Out"}}}, - {G_OP_TYPE_INCREMENT, {{"X"}, {"Out"}}}, - 
{G_OP_TYPE_SLICE, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_ANCHOR_GENERATOR, {{"Input"}, {"Anchors", "Variances"}}}, - {G_OP_TYPE_GENERATE_PROPOSALS, - {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"}, - {"RpnRois", "RpnRoiProbs"}}}, - {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}}, - {G_OP_TYPE_ROIALIGN_POOL, {{"X", "ROIs"}, {"Out"}}}, - {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_FUSION_DECONV_BN_RELU, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_REDUCE_PROD, {{"X"}, {"Out"}}}, - {G_OP_TYPE_ASSIGN, {{"X"}, {"Out"}}}, - {G_OP_TYPE_EQUAL, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_RANGE, {{"Start", "End", "Step"}, {"Out"}}}, - {G_OP_TYPE_CONDITIONAL_BLOCK, {{"Input", "Cond"}, {"Out", "Scope"}}}, - {G_OP_TYPE_WHILE, {{"Condition", "X"}, {"Out", "StepScopes"}}}, - {G_OP_TYPE_BEAM_SEARCH_DECODE, - {{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}}, - {G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}}, - {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}, - {G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_EXPAND, {{"X"}, {"Out"}}}, - {G_OP_TYPE_GRID_SAMPLER, {{"X", "Grid"}, {"Output"}}}}; -} // namespace paddle_mobile diff --git a/mobile/src/common/types.h b/mobile/src/common/types.h deleted file mode 100644 index cc49182adb..0000000000 --- a/mobile/src/common/types.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace paddle_mobile { -enum class Precision : int { FP32 = 0, FP16 = 1 }; - -typedef int16_t half; - -template -struct PrecisionTrait { - typedef void ptype; -}; - -template <> -struct PrecisionTrait { - typedef float ptype; -}; -template <> -struct PrecisionTrait { - typedef half ptype; -}; - -//! device type -enum DeviceTypeEnum { - kINVALID = -1, - kCPU = 0, - kFPGA = 1, - kGPU_MALI = 2, - kGPU_CL = 3 -}; - -template -struct DeviceType {}; - -typedef DeviceType CPU; -typedef DeviceType FPGA; -typedef DeviceType GPU_CL; - -//! data type -enum DataType { - PM_INVALID = -1, - PM_HALF = 0, - PM_FLOAT = 1, - PM_DOUBLE = 2, - PM_INT8 = 3, - PM_INT16 = 4, - PM_INT32 = 5, - PM_INT64 = 6, - PM_UINT8 = 7, - PM_UINT16 = 8, - PM_UINT32 = 9, - PM_STRING = 10, - PM_BOOL = 11, - PM_SHAPE = 12, - PM_TENSOR = 13 -}; -//! -enum PMStatus { - PMSuccess = 0xFF, /*!< No errors */ - PMNotInitialized = 0x01, /*!< Data not initialized. */ - PMInvalidValue = 0x02, /*!< Incorrect variable value. */ - PMMemAllocFailed = 0x03, /*!< Memory allocation error. */ - PMUnKownError = 0x04, /*!< Unknown error. */ - PMOutOfAuthority = 0x05, /*!< Try to modified data not your own*/ - PMOutOfMem = 0x06, /*!< OOM error*/ - PMUnImplError = 0x07, /*!< Unimplement error. */ - PMWrongDevice = 0x08, /*!< un-correct device. */ - PMException = 0x09 /*!< throw exception. 
*/ -}; - -enum PrePostType { - NONE_PRE_POST = 0, - UINT8_255 = 1, -}; - -enum RoundType { - ROUND_NEAREST_AWAY_ZERO = 0, - ROUND_NEAREST_TOWARDS_ZERO = 1, - ROUND_NEAREST_TO_EVEN = 2, -}; - -enum ActivationType { - IDENTITY = 0, - RELU = 1, - RELU6 = 2, - PRELU = 3, - LEAKY_RELU = 4, - TANH = 5, - SIGMOID = 6, - LOG = 7, -}; - -enum PoolingType { - MAX = 0, - AVG = 1, - SUM = 2, - FIRST = 3, - LAST = 4, -}; - -enum PowerMode { - PERFORMANCE_PRIORITY = 0, // let threads run on big cores if - // thread_num <= big_cores_num, - // otherwise the power mode will be - // set to AUTO and all threads are - // scheduled by system - EFFICIENCY_PRIORITY = 1, // let threads run on little cores if - // thread_num <= little_cores_num, - // otherwise the power mode will be - // set to AUTO and all threads are - // scheduled by system - PERFORMANCE_ONLY = 2, // force threads run on big cores, - // and the remains are ignored if - // exceed the number big cores - EFFICIENCY_ONLY = 3, // force threads run on little cores, - // and the remains are ignored if - // exceed the number of little cores - AUTO = 4, // scheduled by system -}; - -enum MemoryOptimizationLevel { - NoMemoryOptimization = 0, - MemoryOptimizationWithoutFeeds = 1, - FullMemoryOptimization = 2, -}; - -struct PaddleMobileConfigInternal { - bool load_when_predict = false; - MemoryOptimizationLevel memory_optimization_level = - MemoryOptimizationWithoutFeeds; - std::string model_obfuscate_key = ""; - PrePostType pre_post_type = NONE_PRE_POST; -}; - -enum ARMArch { - APPLE = 0, - A53 = 53, - A55 = 55, - A57 = 57, - A72 = 72, - A73 = 73, - A75 = 75, - A76 = 76, - ARM_UNKOWN = -1 -}; - -extern const char *G_OP_TYPE_CONV; -extern const char *G_OP_TYPE_BATCHNORM; -extern const char *G_OP_TYPE_INSTANCENORM; -extern const char *G_OP_TYPE_BOX_CODER; -extern const char *G_OP_TYPE_CONCAT; -extern const char *G_OP_TYPE_ELEMENTWISE_ADD; -extern const char *G_OP_TYPE_ELEMENTWISE_SUB; -extern const char *G_OP_TYPE_ELEMENTWISE_MUL; 
-extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU; -extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU; -extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; -extern const char *G_OP_TYPE_FC; -extern const char *G_OP_TYPE_FUSION_CONV_ADD; -extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; -extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU; -extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU; -extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU; -extern const char *G_OP_TYPE_FUSION_CONV_RELU; - -extern const char *G_OP_TYPE_GRU; -extern const char *G_OP_TYPE_GRU_UNIT; -extern const char *G_OP_TYPE_CRF; -extern const char *G_OP_TYPE_BILINEAR_INTERP; -extern const char *G_OP_TYPE_NEAREST_INTERP; -extern const char *G_OP_TYPE_FLATTEN; -extern const char *G_OP_TYPE_FLATTEN2; -extern const char *G_OP_TYPE_SHAPE; -extern const char *G_OP_TYPE_LRN; -extern const char *G_OP_TYPE_MUL; -extern const char *G_OP_TYPE_MULTICLASS_NMS; -extern const char *G_OP_TYPE_NORM; -extern const char *G_OP_TYPE_POOL2D; -extern const char *G_OP_TYPE_PRIOR_BOX; -extern const char *G_OP_TYPE_RELU; -extern const char *G_OP_TYPE_RELU6; -extern const char *G_OP_TYPE_LEAKY_RELU; -extern const char *G_OP_TYPE_RESHAPE; -extern const char *G_OP_TYPE_SCALE; -extern const char *G_OP_TYPE_SIGMOID; -extern const char *G_OP_TYPE_SOFTMAX; -extern const char *G_OP_TYPE_TRANSPOSE; -extern const char *G_OP_TYPE_SPLIT; -extern const char *G_OP_TYPE_FEED; -extern const char *G_OP_TYPE_FETCH; -extern const char *G_OP_TYPE_DEPTHWISE_CONV; -extern const char *G_OP_TYPE_IM2SEQUENCE; -extern const char *G_OP_TYPE_DROPOUT; - -extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN; -extern const char *G_OP_TYPE_FUSION_POOL_BN; -extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU; -extern const char *G_OP_TYPE_FUSION_FC_RELU; -extern const char *G_OP_TYPE_REGION; -extern const char *G_OP_TYPE_FUSION_CONV_BN; -extern const char *G_OP_TYPE_CONV_TRANSPOSE; -extern const char *G_OP_TYPE_PRELU; -extern const char 
*G_OP_TYPE_SUM; -extern const char *G_OP_TYPE_TOP_K; -extern const char *G_OP_TYPE_CAST; -extern const char *G_OP_TYPE_LOG; -extern const char *G_OP_TYPE_LOD_RESET; -extern const char *G_OP_TYPE_LESS_THAN; -extern const char *G_OP_TYPE_LOGICAL_AND; -extern const char *G_OP_TYPE_LOGICAL_OR; -extern const char *G_OP_TYPE_LOGICAL_NOT; -extern const char *G_OP_TYPE_LOGICAL_XOR; -extern const char *G_OP_TYPE_WRITE_TO_ARRAY; -extern const char *G_OP_TYPE_READ_FROM_ARRAY; -extern const char *G_OP_TYPE_IS_EMPTY; -extern const char *G_OP_TYPE_INCREMENT; - -extern const char *G_OP_TYPE_QUANTIZE; -extern const char *G_OP_TYPE_DEQUANTIZE; -extern const char *G_OP_TYPE_FUSION_DEQUANT_BN; -extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN; -extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU; -extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU; -extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT; -extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT; - -extern const char *G_OP_TYPE_TANH; -extern const char *G_OP_TYPE_FUSION_DECONV_RELU; - -extern const char *G_OP_TYPE_FUSION_DECONV_ADD; -extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU; - -extern const char *G_OP_TYPE_SEQUENCE_EXPAND; -extern const char *G_OP_TYPE_SEQUENCE_POOL; -extern const char *G_OP_TYPE_SEQUENCE_SOFTMAX; - -extern const char *G_OP_TYPE_SLICE; -extern const char *G_OP_TYPE_ANCHOR_GENERATOR; -extern const char *G_OP_TYPE_GENERATE_PROPOSALS; -extern const char *G_OP_TYPE_PSROI_POOL; -extern const char *G_OP_TYPE_ROIALIGN_POOL; -extern const char *G_OP_TYPE_ROI_PERSPECTIVE; -extern const char *G_OP_TYPE_PAD2D; -extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; -extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN; -extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU; -extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU; -extern const char *G_OP_TYPE_PIXEL_SHUFFLE; -extern const char *G_OP_TYPE_EXPAND; -extern const char *G_OP_TYPE_GRID_SAMPLER; - -extern std::unordered_map< - 
std::string, std::pair, std::vector>> - op_input_output_key; - -typedef std::map> VariableNameMap; - -} // namespace paddle_mobile diff --git a/mobile/src/common/util.cpp b/mobile/src/common/util.cpp deleted file mode 100644 index acdc42e879..0000000000 --- a/mobile/src/common/util.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "common/util.h" - -namespace paddle_mobile { - -char *ReadFileToBuff(std::string filename) { - FILE *file = fopen(filename.c_str(), "rb"); - PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", - filename.c_str()); - fseek(file, 0, SEEK_END); - int64_t size = ftell(file); - PADDLE_MOBILE_ENFORCE(size > 0, "file should not be empty"); - rewind(file); - char *data = new char[size]; - size_t bytes_read = fread(data, 1, size, file); - PADDLE_MOBILE_ENFORCE(bytes_read == size, - "read binary file bytes do not match with fseek"); - fclose(file); - return data; -} - -int GetFileLength(std::string filename) { - FILE *file = fopen(filename.c_str(), "rb"); - PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", - filename.c_str()); - fseek(file, 0, SEEK_END); - int size = ftell(file); - PADDLE_MOBILE_ENFORCE(size > 0, "file should not be empty"); - fclose(file); - return size; -} - -} // namespace paddle_mobile diff --git a/mobile/src/common/util.h b/mobile/src/common/util.h deleted file mode 100644 index 212362a52e..0000000000 --- 
a/mobile/src/common/util.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "common/enforce.h" - -namespace paddle_mobile { - -char *ReadFileToBuff(std::string filename); - -int GetFileLength(std::string filename); - -} // namespace paddle_mobile diff --git a/mobile/src/common/variant.h b/mobile/src/common/variant.h deleted file mode 100644 index 63795468ff..0000000000 --- a/mobile/src/common/variant.h +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include "common/enforce.h" -#include "common/log.h" -#include "common/type_define.h" - -namespace paddle_mobile { - -template -struct IDToType { - typedef Type type_t; -}; - -template -struct VariantHelper { - inline static void Destroy(kTypeId_t type, void *raw_ptr) { - if (type == type_id()) { - auto ptr = reinterpret_cast(raw_ptr); - delete ptr; - } else { - VariantHelper::Destroy(type, raw_ptr); - } - } -}; - -template -struct VariantHelper { - inline static void Destroy(kTypeId_t type, void *raw_ptr) { - if (type == type_id()) { - auto ptr = reinterpret_cast(raw_ptr); - delete ptr; - } - } -}; - -template -struct VariantDeleter { - kTypeId_t type_ = type_id().hash_code(); - explicit VariantDeleter(kTypeId_t type) { type_ = type; } - void operator()(void *raw_ptr) { - // DLOG << "variant delete: " << type_ << " " << raw_ptr; - VariantHelper::Destroy(type_, raw_ptr); - } -}; - -template -struct Variant { - Variant() : type_(invalid_type()) {} - - Variant(const Variant &variant) { - type_ = variant.type_; - data_ = variant.data_; - } - - virtual ~Variant() { - // DLOG << "variant deinit: " << type_ << " " << (void *)data_.get(); - data_.reset(); - } - - template - void Set(Args &&... 
args) { - auto raw_ptr = new T(std::forward(args)...); - type_ = type_id().hash_code(); - // DLOG << "variant new: " << type_ << " " << (void *)raw_ptr; - data_.reset(raw_ptr, VariantDeleter(type_)); - } - - template - T &Get() const { - return *const_cast(reinterpret_cast(data_.get())); - } - - kTypeId_t TypeId() const { return type_; } - - private: - static inline kTypeId_t invalid_type() { return type_id().hash_code(); } - typedef VariantHelper helper; - kTypeId_t type_ = type_id().hash_code(); - std::shared_ptr data_; -}; - -template -struct Vistor { - typedef T type_t; -}; - -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/alignment.h b/mobile/src/fpga/KD/alignment.h deleted file mode 100644 index 4df852f5fd..0000000000 --- a/mobile/src/fpga/KD/alignment.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef alignment_h -#define alignment_h - -#include - -#include "llapi/zynqmp_api.h" - -namespace paddle_mobile { -namespace zynqmp { - -inline int align_image(int wc) { return align_to_x(wc, IMAGE_ALIGNMENT); } - -} // namespace zynqmp -} // namespace paddle_mobile - -#endif /* alignment_h */ diff --git a/mobile/src/fpga/KD/context.hpp b/mobile/src/fpga/KD/context.hpp deleted file mode 100644 index e7c106ff8c..0000000000 --- a/mobile/src/fpga/KD/context.hpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef Context_hpp -#define Context_hpp - -#include -#include "pe.hpp" -#include "pes/conv_pe.hpp" -#include "pes/depthwise_conv_pe.hpp" -#include "pes/fully_connected_pe.hpp" -#include "pes/input_pe.hpp" -#include "pes/output_pe.hpp" -#include "pes/pooling_pe.hpp" -#include "pes/softmax_pe.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class Context { - public: - template - Ptype& pe() { - if (pe_ == nullptr) { - pe_ = new Ptype(); - } - return static_cast(*pe_); - } - - ~Context() { - if (pe_ != nullptr) { - delete pe_; - } - } - - private: - PE* pe_ = nullptr; -}; -} // namespace zynqmp -} // namespace paddle_mobile - -#endif /* Context_hpp */ diff --git a/mobile/src/fpga/KD/dl_engine.cpp b/mobile/src/fpga/KD/dl_engine.cpp deleted file mode 100644 index a8923fd6c5..0000000000 --- a/mobile/src/fpga/KD/dl_engine.cpp +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "dl_engine.hpp" diff --git a/mobile/src/fpga/KD/dl_engine.hpp b/mobile/src/fpga/KD/dl_engine.hpp deleted file mode 100644 index 861d7231dc..0000000000 --- a/mobile/src/fpga/KD/dl_engine.hpp +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -namespace paddle_mobile { -namespace zynqmp { - -class DLEngine { - public: - static DLEngine& get_instance() { - static DLEngine s_instance; - return s_instance; - } - - private: - DLEngine(); -}; -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/float16.hpp b/mobile/src/fpga/KD/float16.hpp deleted file mode 100644 index f3d5c6637b..0000000000 --- a/mobile/src/fpga/KD/float16.hpp +++ /dev/null @@ -1,506 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle_mobile { -namespace zynqmp { - -typedef uint16_t float16; - -static const uint32_t mantissatable[2048] = { - 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, - 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, - 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, - 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, - 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, - 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, - 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, - 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, - 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, - 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, - 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, - 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, - 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, - 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, - 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, - 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, - 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, - 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, - 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, - 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, - 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, - 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, - 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, - 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 
0x370e0000, 0x370f0000, - 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, - 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, - 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, - 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, - 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, - 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, - 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, - 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, - 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, - 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, - 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, - 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, - 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, - 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, - 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, - 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, - 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, - 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, - 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, - 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, - 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, - 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, - 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, - 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, - 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, - 0x37960000, 0x37968000, 0x37970000, 0x37978000, 
0x37980000, 0x37988000, - 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, - 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, - 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, - 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, - 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, - 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, - 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, - 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, - 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, - 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, - 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, - 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, - 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, - 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, - 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, - 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, - 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, - 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, - 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, - 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, - 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, - 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, - 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, - 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, - 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, - 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, - 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 
0x37e90000, 0x37e98000, - 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, - 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, - 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, - 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, - 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, - 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, - 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, - 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, - 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, - 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, - 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, - 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, - 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, - 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, - 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, - 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, - 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, - 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, - 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, - 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, - 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, - 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, - 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, - 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, - 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, - 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, - 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 
0x381d0000, 0x381d4000, - 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, - 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, - 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, - 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, - 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, - 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, - 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, - 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, - 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, - 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, - 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, - 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, - 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, - 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, - 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, - 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, - 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, - 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, - 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, - 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, - 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, - 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, - 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, - 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, - 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, - 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, - 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 
0x38458000, 0x3845c000, - 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, - 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, - 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, - 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, - 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, - 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, - 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, - 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, - 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, - 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, - 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, - 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, - 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, - 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, - 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, - 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, - 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, - 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, - 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, - 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, - 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, - 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, - 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, - 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, - 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, - 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, - 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 
0x386e0000, 0x386e4000, - 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, - 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, - 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, - 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, - 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, - 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, - 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, - 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, - 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, - 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, - 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, - 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, - 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, - 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, - 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, - 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, - 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, - 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, - 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, - 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, - 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, - 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, - 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, - 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, - 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, - 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, - 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 
0x380b4000, 0x380b6000, - 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, - 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, - 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, - 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, - 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, - 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, - 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, - 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, - 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, - 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, - 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, - 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, - 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, - 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, - 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, - 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, - 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, - 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, - 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, - 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, - 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, - 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, - 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, - 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, - 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, - 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, - 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 
0x381f8000, 0x381fa000, - 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, - 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, - 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, - 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, - 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, - 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, - 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, - 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, - 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, - 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, - 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, - 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, - 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, - 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, - 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, - 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, - 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, - 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, - 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, - 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, - 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, - 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, - 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, - 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, - 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, - 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, - 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 
0x3833c000, 0x3833e000, - 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, - 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, - 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, - 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, - 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, - 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, - 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, - 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, - 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, - 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, - 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, - 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, - 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, - 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, - 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, - 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, - 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, - 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, - 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, - 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, - 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, - 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, - 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, - 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, - 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, - 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 
0x38480000, 0x38482000, - 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, - 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, - 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, - 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, - 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, - 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, - 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, - 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, - 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, - 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, - 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, - 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, - 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, - 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, - 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, - 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, - 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, - 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, - 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, - 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, - 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, - 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, - 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, - 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, - 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, - 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 
0x385c4000, 0x385c6000, - 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, - 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, - 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, - 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, - 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, - 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, - 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, - 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, - 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, - 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, - 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, - 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, - 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, - 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, - 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, - 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, - 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, - 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, - 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, - 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, - 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, - 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, - 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, - 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, - 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, - 0x38700000, 0x38702000, 0x38704000, 0x38706000, 
0x38708000, 0x3870a000, - 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, - 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, - 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, - 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, - 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, - 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, - 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, - 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, - 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, - 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, - 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, - 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, - 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, - 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, - 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, - 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, - 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, - 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, - 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, - 0x387fc000, 0x387fe000}; - -static const uint16_t offsettable[64] = { - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; - -static const uint32_t exponenttable[64] = { - 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, - 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, - 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, - 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, - 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, - 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, - 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, - 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, - 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, - 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, - 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; - -static const uint16_t basetable[512] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, - 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, - 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 
0x2800, 0x2c00, 0x3000, 0x3400, - 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, - 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, - 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 
0x8040, 0x8080, 0x8100, 0x8200, - 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, - 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, - 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, - 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; - -static const uint8_t shifttable[512] = { - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, - 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, - 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; - -inline float16 float_to_half(float f) { - uint32_t v = *reinterpret_cast(&f); - return basetable[(v >> 23) & 0x1ff] + - ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); -} - -inline float half_to_float(float16 h) { - uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + - exponenttable[h >> 10]; - return *reinterpret_cast(&v); -} - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/layout.hpp b/mobile/src/fpga/KD/layout.hpp deleted file mode 100644 index 8df0d11d3b..0000000000 --- a/mobile/src/fpga/KD/layout.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "fpga/KD/alignment.h" - -namespace paddle_mobile { -namespace zynqmp { - -enum LayoutType { - N, - NC, - NCHW, - NHWC, - NHW, -}; - -class Layout { - public: - virtual int numIndex() = 0; - virtual int channelIndex() { return -1; } - virtual int heightIndex() { return -1; } - virtual int widthIndex() { return -1; } - virtual int alignedElementCount(const std::vector& dims) = 0; - virtual int elementCount(const std::vector& dims) = 0; -}; - -struct NCHW : Layout { - int numIndex() { return 0; } - int channelIndex() { return 1; } - int heightIndex() { return 2; } - int widthIndex() { return 3; } - int alignedElementCount(const std::vector& dims) { - return dims[0] * dims[2] * align_image(dims[1] * dims[3]); - } - virtual int elementCount(const std::vector& dims) { - return dims[0] * dims[1] * dims[2] * dims[3]; - } -}; - -struct NHWC : Layout { - int numIndex() { return 0; } - int heightIndex() { return 1; } - int widthIndex() { return 2; } - int channelIndex() { return 3; } - int alignedElementCount(const std::vector& dims) { - return dims[0] * dims[1] * align_image(dims[2] * dims[3]); - } - virtual int elementCount(const std::vector& dims) { - return dims[0] * dims[1] * dims[2] * dims[3]; - } -}; - -struct NC : Layout { - int numIndex() { return 0; } - int channelIndex() { return 1; } - int alignedElementCount(const std::vector& dims) { - return dims[0] * dims[1]; - } - virtual int elementCount(const std::vector& dims) { - return dims[0] * dims[1]; - } -}; - -struct N : Layout { - int numIndex() { return 0; } - int alignedElementCount(const std::vector& dims) { return dims[0]; } - virtual int elementCount(const std::vector& dims) { return dims[0]; } -}; - -struct NHW : Layout { - int numIndex() { return 0; } - int heightIndex() { return 1; } - int widthIndex() { return 2; } - int alignedElementCount(const std::vector& dims) { - // TODO(chonwhite) align it; - return dims[0] * dims[1] * dims[2]; - } - virtual int 
elementCount(const std::vector& dims) { - return dims[0] * dims[1] * dims[2]; - } -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/bias_scale.cpp b/mobile/src/fpga/KD/llapi/bias_scale.cpp deleted file mode 100644 index 612c86871c..0000000000 --- a/mobile/src/fpga/KD/llapi/bias_scale.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "fpga/KD/llapi/bias_scale.h" -#include "fpga/KD/llapi/zynqmp_api.h" - -namespace paddle_mobile { -namespace zynqmp { -namespace bias_scale { - -void align_element(float **data_in, int num_per_div_before_alignment, int num) { - int copynum = 0; - float *ptr_unaligned = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); - int num_element = - 2 * div_num * num_per_div_after_alignment; // including bias & scale - float *ptr_aligned = - (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT - - memset(ptr_aligned, 0, num_element * sizeof(float)); - for (int i = 0; i < div_num; i++) { - if (i == div_num - 1) { - copynum = (num_per_div_after_alignment * div_num > num) - ? 
(num % num_per_div_after_alignment) - : (num_per_div_before_alignment); - } else { - copynum = num_per_div_before_alignment; - } - - memcpy(ptr_aligned + i * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i, - copynum * sizeof(float)); - memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i + num, - copynum * sizeof(float)); - } - fpga_free(ptr_unaligned); - *data_in = ptr_aligned; -} - -void interleave(float **data_in, int num_after_alignment) { - float *ptr_uninterleaved = *data_in; - float *ptr_interleaved = - (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT - int num = num_after_alignment / 4; - for (int i = 0; i < num; i++) { - memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, - 4 * sizeof(float)); - memcpy(ptr_interleaved + 8 * i + 4, - ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float)); - } - - fpga_free(ptr_uninterleaved); - *data_in = ptr_interleaved; -} - -void format_bias_scale_array(float **bias_scale_array, - int element_num_per_division, int num) { - align_element(bias_scale_array, element_num_per_division, num); - int div_num = (num + element_num_per_division - 1) / element_num_per_division; - int element_num_after_division = - align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); - interleave(bias_scale_array, div_num * element_num_after_division); - fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); -} -void format_bias_array(float **bias_array, int num) { - float *ptr_unaligned = *bias_array; - int num_before_align = num; - int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT); - int16_t *ptr_aligned = - (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT - - memset(ptr_aligned, 0, num_after_align * sizeof(int16_t)); - for (int i = 0; i < num_before_align; i++) { - float value = ptr_aligned[i]; - ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]); 
- } - *bias_array = (float *)ptr_aligned; // NOLINT - fpga_free(ptr_unaligned); -} - -} // namespace bias_scale -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/bias_scale.h b/mobile/src/fpga/KD/llapi/bias_scale.h deleted file mode 100644 index 66f05cc647..0000000000 --- a/mobile/src/fpga/KD/llapi/bias_scale.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle_mobile { -namespace zynqmp { -namespace bias_scale { - -void align_element(float** data_in, int num_per_div_before_alignment, int num); -void interleave(float** data_in, int num_after_alignment); -void format_bias_scale_array(float** bias_scale_array, - int element_num_per_division, int num); -void format_bias_array(float** bias_array, int num); - -} // namespace bias_scale -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/config.h b/mobile/src/fpga/KD/llapi/config.h deleted file mode 100755 index be919489fb..0000000000 --- a/mobile/src/fpga/KD/llapi/config.h +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_MOBILE_ZU5 -#define FPGA_PRINT_MODE -#define PADDLE_MOBILE_PROFILE diff --git a/mobile/src/fpga/KD/llapi/filter.cpp b/mobile/src/fpga/KD/llapi/filter.cpp deleted file mode 100644 index f9e5717e32..0000000000 --- a/mobile/src/fpga/KD/llapi/filter.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/KD/llapi/filter.h" -#include -#include -#include "fpga/KD/float16.hpp" -#include "fpga/KD/llapi/zynqmp_api.h" - -namespace paddle_mobile { -namespace zynqmp { -namespace filter { - -int calc_division_capacity(int chw) { - int n = 2048 / ((chw + 15) / 16) * 32; - return n < 2048 ? 
n : 2048; -} - -int calc_split_num(int num, int division_capacity) { - return (num + division_capacity - 1) / division_capacity; -} - -int calc_division_number(int num, int group_num, int division_capacity) { - int split_num = calc_split_num(num, division_capacity); - // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - // "Split number or group number should be 1"); - return group_num * split_num; -} - -int calc_num_per_div(int num, int group_num, int division_capacity) { - if (group_num == 1) { - if (num > division_capacity) { - return division_capacity; - } else { - return num; - } - } else { - return (num + group_num - 1) / group_num; - } -} - -void convert_to_hwc(char **data_in, int num, int channel, int height, - int width) { - char *tmp = *data_in; - int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * chw + offset_height + w * channel + c) = - *((*data_in)++); - } - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -float find_max(float *data_in, int data_size) { - float max = 0.0; - for (int i = 0; i < data_size; ++i) { - float value = data_in[i]; - float abs = value > 0 ? 
value : -value; - max = std::max(max, abs); - } - return max; -} - -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} - -void quantize(float **data_in, int data_size, float max) { - float *tmp = *data_in; - float fix_range = 127; - float scale = fix_range / max; - - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8( - (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} - -void align_element(char **data_in, int num, int chw) { - int j = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - if (align_chw != chw) { - char *tmp = *data_in; - char *data_tmp = - (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num * align_chw); - for (j = 0; j < num; j++) { - memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); - } - *data_in = data_tmp; - fpga_free(tmp); - } -} - -void align_num(char **data_in, int num_per_div_before_alignment, int num, - int chw) { - int i = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(char)); - - for (i = 0; i < div_num - 1; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - num_per_div_before_alignment * align_chw); - } - - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + 
num_per_div_before_alignment * align_chw * i, - (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); - - *data_in = data_tmp; - fpga_free(tmp); -} - -void reorder(char **data_in, int num_after_alignment, int chw) { - int index = 0; - int new_index = 0; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - for (index = 0; index < num_after_alignment; index++) { - new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + - (index / 16 % 2 * 4); - memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align, - chw_align); - } - *data_in = data_tmp; - fpga_free(tmp); -} - -size_t interleave(char **data_in, int num_after_alignment, int chw) { - int i = 0; - int j = 0; - int k = 0; - int interleave_per_num = 16; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - std::cout << "interleave size:" << chw_align * num_after_alignment - << std::endl; - char *tmp = *data_in; - int interleave_num = chw_align * 2 / interleave_per_num; - for (i = 0; i < num_after_alignment; i += 2) { - for (j = 0, k = 0; j < interleave_num; j += 2, k++) { - memcpy(data_tmp + i * chw_align + interleave_per_num * j, - *data_in + i * chw_align + interleave_per_num * k, - interleave_per_num); - memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), - *data_in + (i + 1) * chw_align + interleave_per_num * k, - interleave_per_num); - } - } - *data_in = data_tmp; - fpga_free(tmp); - return chw_align * num_after_alignment; -} - -size_t format_filter(float **data_in, int num, int channel, int height, - int width, int group_num, float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int 
num_per_div_before_alignment = - calc_num_per_div(num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - - reorder(quantize_data, num_after_alignment, chw); - size_t mem_size = interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * - num_after_alignment * sizeof(char)); - return mem_size; -} - -void convert_fc_filter(char **data_in, int num, int chw) { - char *tmp = *data_in; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - for (int c = 0; c < chw; c++) { - data_tmp[n * chw + c] = (*data_in)[num * c + n]; - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void format_fc_filter(float **data_in, int num, int channel, int height, - int width, int group_num, float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int num_per_div_before_alignment = - calc_num_per_div(num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int 
num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_fc_filter(quantize_data, num, chw); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - reorder(quantize_data, num_after_alignment, chw); - interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * - num_after_alignment * sizeof(char)); -} -void convert_to_hwn(int16_t **data_in, int num, int height, int width) { - int16_t *tmp = *data_in; - int16_t *data_tmp = - (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - *(data_tmp + h * width * num + w * num + n) = *((*data_in)++); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void align_element_n(int16_t **data_in, int num, int height, int width) { - int unalign_n = num; - int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); - if (unalign_n == align_n) { - return; - } else { - int16_t *tmp = *data_in; - - int num_element = height * width * align_n; - int16_t *data_tmp = - (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(int16_t)); - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int offset_unalign = h * width * unalign_n + w * unalign_n; - int offset_align = h * width * align_n + w * align_n; - for (int n = 0; n < unalign_n; n++) { - data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n); - } - } - } - *data_in = data_tmp; - free(tmp); - } -} -void quantize_to_fp16(float **data_in, int num, int height, int width, - float *scale_ptr) { - float *tmp = *data_in; - int size = 
num * height * width; - - float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT - for (int n = 0; n < num; n++) { - float scale_val = scale_ptr[n]; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int index = n * height * width + h * width + w; - float value = tmp[index] * scale_val; - tmp_data[index] = float_to_half(value); - } - } - } - fpga_flush(tmp_data, size * sizeof(int16_t)); - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} -void format_dwconv_filter(float **data_in, int num, int height, int width, - float *scale_ptr) { - quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT - convert_to_hwn(quantize_data, num, height, width); - align_element_n(quantize_data, num, height, width); - fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * - height * width * sizeof(int16_t)); -} -} // namespace filter -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/filter.h b/mobile/src/fpga/KD/llapi/filter.h deleted file mode 100644 index 80c027a104..0000000000 --- a/mobile/src/fpga/KD/llapi/filter.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -namespace paddle_mobile { -namespace zynqmp { -namespace filter { - -int calc_division_capacity(int chw); -int calc_split_num(int num, int division_capacity); -int calc_division_number(int num, int group_num, int division_capacity); -int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc(char** data_in, int num, int channel, int height, - int width); -float find_max(float* data_in, int data_size); -void quantize(float** data_in, int data_size, float max); -void align_element(char** data_in, int num, int chw); -void align_num(char** data_in, int num_per_div_before_alignment, int num, - int chw); -void reorder(char** data_in, int num_after_alignment, int chw); -size_t interleave(char** data_in, int num_after_alignment, int chw); -size_t format_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max); - -void convert_fc_filter(char** data_in, int num, int chw); -void format_fc_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max); - -void convert_to_hwn(int16_t** data_in, int num, int height, int width); -void align_element_n(int16_t** data_in, int num, int height, int width); -void quantize_to_fp16(float** data_in, int num, int height, int width, - float* scale_ptr); -void format_dwconv_filter(float** data_in, int num, int height, int width, - float* scale_ptr); - -} // namespace filter -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/image.cpp b/mobile/src/fpga/KD/llapi/image.cpp deleted file mode 100644 index d44d25420a..0000000000 --- a/mobile/src/fpga/KD/llapi/image.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "fpga/KD/llapi/image.h" -#include "fpga/KD/llapi/zynqmp_api.h" - -namespace paddle_mobile { -namespace zynqmp { -namespace image { - -void convert_to_hwc(float **data_in, int channel, int height, int width) { - float *tmp = *data_in; - float *data_tmp = - (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + offset_height + w * channel + c) = *((*data_in)++); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void align_element_conv(float **data_in, int height, int cw) { - int h = 0; - int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - if (align_cw != cw) { - float *tmp = *data_in; - float *data_tmp = - (float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT - - memset(data_tmp, 0, height * align_cw * sizeof(float)); - - for (h = 0; h < height; h++) { - memcpy((void *)(data_tmp + h * align_cw), // NOLINT - (void *)(*data_in + h * cw), // NOLINT - cw * sizeof(float)); - } - *data_in = data_tmp; - fpga_free(tmp); - } -} - -void format_image(float **data_in, int channel, int height, int width) { - // convert_to_hwc(data_in, channel, height, width); - align_element_conv(data_in, height, channel * width); - fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height * - sizeof(float)); -} - -void concat_images(int16_t **images_in, float **scales_in, void *image_out, - float 
*scale_out, int image_num, uint32_t *channel_num, - int height, int width) { - int i = 0; - int j = 0; - int k = 0; - int each_out_line_channel = 0; - int align_each_out_area_cw = 0; - int align_each_in_area_cw = 0; - int align_each_out_area_cw_differ = 0; - int tmp_channel = 0; - scale_out[0] = 0.0; - scale_out[1] = 0.0; - for (i = 0; i < image_num; i++) { - each_out_line_channel += channel_num[i]; - scale_out[0] = std::max(*scale_out, scales_in[i][0]); - // fpga_invalidate(images_in[i], - // height * - // align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * - // sizeof(int16_t)); - } - scale_out[1] = 1 / scale_out[0]; - align_each_out_area_cw = - align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); - align_each_out_area_cw_differ = - align_each_out_area_cw - each_out_line_channel * width; - - for (k = 0; k < height; k++) { - for (j = 0; j < width; j++) { - for (i = 0; i < image_num; i++) { - align_each_in_area_cw = - align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - memcpy((int16_t *)image_out + tmp_channel + // NOLINT - k * align_each_out_area_cw_differ, - images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, - channel_num[i] * sizeof(int16_t)); - - tmp_channel += channel_num[i]; - } - } - } - fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); -} - -void split_image(int16_t *image_in, const float *scale_in, void **images_out, - float **scales_out, int image_num, - const uint32_t *channel_nums, int height, int width) { - int total_channel = 0; - for (int i = 0; i < image_num; i++) { - scales_out[i][0] = scale_in[0]; - scales_out[i][1] = scale_in[1]; - total_channel += channel_nums[i]; - } - int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT); - fpga_invalidate(image_in, element_num * sizeof(int16_t)); - - int src_offset = 0; - int des_offset = 0; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) + - w 
* total_channel; - for (int i = 0; i < image_num; i++) { - des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + - w * channel_nums[i]; - memcpy(reinterpret_cast(images_out[i] + des_offset), - image_in + src_offset, channel_nums[i] * sizeof(int16_t)); - src_offset += channel_nums[i]; - } - } - } - - for (int i = 0; i < image_num; i++) { - element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT); - fpga_flush(images_out[i], element_num * sizeof(int16_t)); - } -} - -} // namespace image -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/image.h b/mobile/src/fpga/KD/llapi/image.h deleted file mode 100644 index d01877397a..0000000000 --- a/mobile/src/fpga/KD/llapi/image.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -namespace paddle_mobile { -namespace zynqmp { -namespace image { - -void convert_to_hwc(float** data_in, int channel, int height, int width); -void align_element_conv(float** data_in, int height, int cw); -void format_image(float** data_in, int channel, int height, int width); - -// Concat featuremaps along channel direction -void concat_images(int16_t** images_in, float** scales_in, void* image_out, - float* scale_out, int image_num, uint32_t* channel_num, - int height, int width); - -// Split featuremap along channel direction -void split_image(int16_t* image_in, const float* scale_in, void** images_out, - float** scales_out, int image_num, - const uint32_t* channel_nums, int height, int width); -} // namespace image -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/zynqmp_api.cpp b/mobile/src/fpga/KD/llapi/zynqmp_api.cpp deleted file mode 100644 index ec6ee9f331..0000000000 --- a/mobile/src/fpga/KD/llapi/zynqmp_api.cpp +++ /dev/null @@ -1,384 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "fpga/KD/llapi/config.h" -#include "fpga/KD/llapi/zynqmp_api.h" - -namespace paddle_mobile { -namespace zynqmp { - -#define PADDLE_MOBILE_OS_LINUX - -static int fd = -1; -static const char *device_path = "/dev/fpgadrv0"; -static std::map memory_map; - -static size_t memory_size_max = 0; -static size_t memory_size = 0; - -static inline int do_ioctl(uint64_t req, const void *arg) { -#ifdef PADDLE_MOBILE_OS_LINUX - return ioctl(fd, req, arg); -#else - return -1; -#endif -} - -int open_device() { - std::cout << "open_device" << std::endl; - if (fd == -1) { - fd = open(device_path, O_RDWR); - } - std::cout << "open_device fd:" << fd << std::endl; - return fd; -} - -void close_device() { close(fd); } - -void reset_device() { - FpgaResetArgs args; - do_ioctl(IOCTL_FPGA_RESET, &args); -} - -// memory management; -void *fpga_malloc(size_t size) { -// std::cout << "fpga malloc: 0x" << std::hex << size << std::dec << " (" << -// size << ") - "; -#ifdef ENABLE_DEBUG -// std::cout << "fpga_malloc:" << size << std::endl; -#endif -#ifdef PADDLE_MOBILE_OS_LINUX - void *ptr = reinterpret_cast( - mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); - if (ptr == NULL) { - std::cout << "not enough memory !"; - exit(-1); - } - // std::cout << std::hex << ptr << std::dec << std::endl; - memory_map.insert(std::make_pair(ptr, size)); - memory_size += size; - if (memory_size > memory_size_max) { - memory_size_max = memory_size; - } - return ptr; -#else - return malloc(size); -#endif -} - -size_t fpga_get_memory_size(void *ptr) { return memory_map[ptr]; } - -size_t fpga_get_memory_size_max() { return memory_size_max; } - -size_t fpga_diagnose_memory(int detailed) { - size_t total = 0; - // size_t size = 0; - // int i = 0; - auto iter = memory_map.begin(); // std::map::iterator - while (iter != memory_map.end()) { - total += iter->second; - iter++; - } - return total; -} - -void 
fpga_free(void *ptr) { - size_t size = 0; - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); - } - - memory_size -= size; - -#ifdef PADDLE_MOBILE_OS_LINUX - - munmap(ptr, size); -#else - free(ptr); -#endif -} - -void fpga_copy(void *dst, const void *src, int size) { memcpy(dst, src, size); } - -int fpga_flush(void *address, size_t size) { - struct MemoryCacheArgs args; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int fpga_invalidate(void *address, size_t size) { - // std::cout << - // "==================================================================================" - // << std::endl; - struct MemoryCacheArgs args; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -int invalidate_cache(void *addr, int size) { - struct MemoryCacheArgs args; - args.address = addr; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -int flush_cache(void *addr, int size) { - struct MemoryCacheArgs args; - args.address = addr; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -void fpga_copy(void *dest, const void *src, size_t num) { - memcpy(dest, src, num); -} - -int ioctl_conv(const struct ConvArgs &args) { -#ifdef ENABLE_DEBUG -// std::cout << "======Compute Basic Conv======"; -// std::cout << " relu_enabled:" << args.relu_enabled -// << " sb_address:" << args.sb_address -// << " filter_address:" << args.filter_address -// << " filter_num:" << args.filter_num -// << " group_num:" << args.group_num; -// std::cout << " image_address:" << args.image.address -// << " image_scale_address:" << args.image.scale_address -// << " image_channels:" << args.image.channels -// << " image_height:" << args.image.height -// << " image_width:" << args.image.width -// << " pad_height:" << args.image.pad_height -// << " pad_width:" << args.image.pad_width; -// 
std::cout << " kernel_height:" << args.kernel.height -// << " kernel_width:" << args.kernel.width -// << " stride_h:" << args.kernel.stride_h -// << " stride_w:" << args.kernel.stride_w; -// std::cout << " out_address:" << args.output.address -// << " out_scale_address:" << args.output.scale_address; -// -// float* in_scale = (float*)args.image.scale_address; -// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] << -// std::endl; - -#endif - - return do_ioctl(IOCTL_CONFIG_CONV, &args); - - // return 0; -} - -int compute_fpga_conv_basic(const struct ConvArgs &args) { -#ifdef ENABLE_DEBUG - -// std::cout << "======Compute Basic Conv======"; -// std::cout << " relu_enabled:" << args.relu_enabled -// << " sb_address:" << args.sb_address -// << " filter_address:" << args.filter_address -// << " filter_num:" << args.filter_num -// << " group_num:" << args.group_num; -// std::cout << " image_address:" << args.image.address -// << " image_scale_address:" << args.image.scale_address -// << " image_channels:" << args.image.channels -// << " image_height:" << args.image.height -// << " image_width:" << args.image.width -// << " pad_height:" << args.image.pad_height -// << " pad_width:" << args.image.pad_width; -// std::cout << " kernel_height:" << args.kernel.height -// << " kernel_width:" << args.kernel.width -// << " stride_h:" << args.kernel.stride_h -// << " stride_w:" << args.kernel.stride_w; -// std::cout << " out_address:" << args.output.address -// << " out_scale_address:" << args.output.scale_address; - -// float *in_scale = (float *)args.image.scale_address; -// std::cout << " scale:" << in_scale[0] << "," << in_scale[1] << -// std::endl; - -// float *filter_scale = (float *)args.filter_scale_address; -// std::cout << " filter scale:" << filter_scale[0] << "," << -// filter_scale[1] << std::endl; - -#endif - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int compute_fpga_conv(const struct SplitConvArgs &args) { - // return do_ioctl(IOCTL_CONFIG_CONV, 
&args); - int split_num = args.split_num; - int ret = -1; - for (int i = 0; i < split_num; i++) { - // ComputeBasicConv(args.conv_args[i]); - ret = compute_fpga_conv_basic(args.conv_arg[i]); - } - - if (split_num > 1) { - std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl; - exit(-1); - } - return ret; -} - -int compute_fpga_pool(const struct PoolingArgs &args) { - return do_ioctl(IOCTL_CONFIG_POOLING, &args); -} - -int compute_fpga_ewadd(const struct EWAddArgs &args) { - return do_ioctl(IOCTL_CONFIG_EW, &args); -} - -int perform_bypass(const struct BypassArgs &args) { - int size = args.image.channels * args.image.width * args.image.height; - int max_size = 1 << 21; - - float times = 1.0 * size / max_size; - int count = static_cast(times); - - void *input_address = args.image.address; - int type_size = - args.input_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t); - - void *output_address = args.output.address; - int out_type_size = - args.output_data_type == DATA_TYPE_FP32 ? 
sizeof(float) : sizeof(int16_t); - - struct BypassArgs bypassArgs = args; - bypassArgs.image.width = 1; - bypassArgs.image.height = 1; - - // std::cout << "times:" << times << " count:" << count << std::endl; - - for (int i = 0; i < count; ++i) { - bypassArgs.image.channels = max_size; - bypassArgs.image.address = - reinterpret_cast(input_address + i * max_size * type_size); - bypassArgs.output.address = - reinterpret_cast(output_address + i * max_size * out_type_size); - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); - if (ret != 0) { - return ret; - } - // std::cout << "@:" << i << " ret:" << ret << std::endl; - } - - int remainder = size - max_size * count; - // std::cout << "remainder:" << remainder << std::endl; - bypassArgs.image.channels = remainder; - bypassArgs.image.address = - reinterpret_cast(input_address + count * max_size * type_size); - bypassArgs.output.address = reinterpret_cast( - output_address + count * max_size * out_type_size); - return do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); -} - -int compute_fpga_concat(const struct ConcatArgs &args) { return -1; } - -int compute_fpga_scale(const struct ScaleArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Scale======"; - std::cout << "scale_address:" << args.scale_address << std::endl; - std::cout << "bias_address:" << args.bias_address << std::endl; - - std::cout << "wc_alignment:" << args.wc_alignment << std::endl; - std::cout << "channel_alignment:" << args.channel_alignment << std::endl; - - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif - return do_ioctl(IOCTL_CONFIG_SCALE, 
&args); -} - -int compute_fpga_dwconv(const struct DWconvArgs &args) { - std::cout << "======Compute Basic Conv======"; - std::cout << " relu_enabled:" << args.relu_enabled - << " filter_address:" << args.filter_address; - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - std::cout << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - - // float *in_scale = (float *)args.image.scale_address; - // std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] << - // std::endl; - - return do_ioctl(IOCTL_CONFIG_DWCONV, &args); -} - -// int config_power(const struct PowerArgs& args) { -// return do_ioctl(IOCTL_CONFIG_POWER, &args); -// } - -int config_inplace(const struct InplaceArgs &args) { - return do_ioctl(IOCTL_CONFIG_INPLACE, &args); -} - -// uint64_t vaddr_to_paddr(void *address) { -// return 0; -// } - -int16_t fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(int16_t fp16_num) { - if (0 == fp16_num) { - return 0; - } - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num = 0; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -} // namespace zynqmp -} // namespace 
paddle_mobile diff --git a/mobile/src/fpga/KD/llapi/zynqmp_api.h b/mobile/src/fpga/KD/llapi/zynqmp_api.h deleted file mode 100644 index 89d9754903..0000000000 --- a/mobile/src/fpga/KD/llapi/zynqmp_api.h +++ /dev/null @@ -1,329 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H -#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H - -#include -#include -#include -#include - -namespace paddle_mobile { -namespace zynqmp { - -typedef int16_t half; - -#define IMAGE_ALIGNMENT 16 // Aligned to 16 -#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 -#define BS_NUM_ALIGNMENT 8 -#define BIAS_NUM_ALIGNMENT 16 - -enum DDataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum DLayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct VersionArgs { - void* buffer; -}; - -struct MemoryCopyArgs { - void* src; - void* dest; - size_t size; -}; - -struct MemoryCacheArgs { - void* address; - size_t size; -}; - -struct MemoryBarrierArgs {}; - -struct BNArgs { - bool enabled; - void* bias_address; - void* scale_address; -}; - -/** -Conv and Pooling kernel -*/ -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - void* scale_address; // input scale 
address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - void* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct DWconvArgs { - bool relu_enabled; - void* bias_address; - void* filter_address; - struct KernelArgs kernel; - struct ImageInputArgs image; - struct ImageOutputArgs output; - uint16_t out_width; - uint16_t out_height; - uint16_t sub_conv_num; -}; - -struct PoolingArgs { - uint16_t mode; - uint16_t kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; - uint16_t out_width; - uint16_t out_height; -}; - -// elementwise add arguments -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DDataType input_data_type; - enum DDataType output_data_type; - enum DLayoutType input_layout_type; - enum DLayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct ScaleArgs { - void* scale_address; - void* bias_address; - uint32_t wc_alignment; - uint32_t channel_alignment; - - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct NormalizeArgs { - void* input_image_address; - void* output_image_address; - uint32_t image_width; - uint32_t image_height; - uint32_t image_channel; - uint32_t* output_scale_address; -}; - -struct PowerParameterArgs { - 
uint16_t shift; - uint16_t scale; - uint16_t power; -}; - -struct NormalizeParameterArgs { - uint32_t channel; - uint32_t hight_width; -}; - -struct InplaceArgs { - bool relu_enable; - bool power_enable; - bool normalize_enable; -}; - -struct FpgaRegWriteArgs { - uint64_t address; // - uint64_t value; -}; - -struct FpgaRegReadArgs { - uint64_t address; - uint64_t value; -}; - -struct FpgaResetArgs {}; - -#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4) - -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) - -#define IOCTL_SEPARATOR_0 10 - -#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) -#define IOCTL_MEMORY_BARRIER \ - _IOW(IOCTL_FPGA_MAGIC, 14, struct MemoryBarrierArgs) - -#define IOCTL_SEPARATOR_1 20 - -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) -#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) -#define IOCTL_CONFIG_SCALE _IOW(IOCTL_FPGA_MAGIC, 25, struct ScaleArgs) -#define IOCTL_CONFIG_NORMALIZE _IOW(IOCTL_FPGA_MAGIC, 26, struct NormalizeArgs) - -#define IOCTL_CONFIG_DWCONV _IOW(IOCTL_FPGA_MAGIC, 31, struct DWconvArgs) - -#define IOCTL_CONFIG_INPLACE _IOW(IOCTL_FPGA_MAGIC, 40, struct InplaceArgs) -#define IOCTL_CONFIG_POWER_PARAMETER \ - _IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs) -#define IOCTL_CONFIG_NORMALIZE_PARAMETER \ - _IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs) -#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs) -#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs) -#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs) - 
-//============================== API ============================= - -// struct DWconvArgs { -// bool relu_enabled; -// void* bias_address; -// void* filter_address; -// struct KernelArgs kernel; -// struct ImageInputArgs image; -// struct ImageOutputArgs output; -// }; - -struct DeconvArgs { - uint32_t sub_conv_num; - uint32_t group_num; - uint32_t filter_num; - uint32_t omit_size; - uint32_t sub_output_width; - uint32_t sub_output_height; - struct ImageOutputArgs output; - struct SplitConvArgs* split_conv_args; -}; - -struct SplitArgs { - uint32_t image_num; - int16_t* image_in; - float* scale_in; - void** images_out; - float** scales_out; - uint32_t* out_channel_nums; - uint32_t height; - uint32_t width; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_arg; - struct ConcatArgs concat_arg; -}; - -struct GroupConvArgs { - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct SplitConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } -int open_device(); -void close_device(); - -void reset_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -size_t fpga_get_memory_size(void* ptr); -size_t fpga_get_memory_size_max(); -size_t fpga_diagnose_memory(int detailed); - -void fpga_copy(void* dst, const void* src, int size); - -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -int perform_bypass(const struct BypassArgs& args); -int compute_fpga_conv_basic(const struct ConvArgs& args); -int compute_fpga_conv(const struct SplitConvArgs& args); -int compute_fpga_pool(const struct PoolingArgs& args); -int 
compute_fpga_ewadd(const struct EWAddArgs& args); -int compute_fpga_scale(const struct ScaleArgs& args); -int compute_fpga_concat(const struct ConcatArgs& args); -int config_power(const struct PowerArgs& args); -int compute_fpga_dwconv(const struct DWconvArgs& args); - -// int config_relu(const struct ReluArgs& args); - -int config_inplace(const struct InplaceArgs& args); - -int flush_cache(void* addr, int size); -int invalidate_cache(void* addr, int size); - -int16_t fp32_2_fp16(float fp32_num); -float fp16_2_fp32(int16_t fp16_num); -} // namespace zynqmp -} // namespace paddle_mobile - -#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H diff --git a/mobile/src/fpga/KD/pe.hpp b/mobile/src/fpga/KD/pe.hpp deleted file mode 100644 index e2be6b3610..0000000000 --- a/mobile/src/fpga/KD/pe.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifndef PE_hpp -#define PE_hpp - -#include -#include -#include "pe_params.hpp" -#include "tensor_util.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class PE { - public: - virtual bool init() { return false; } - - virtual void apply() {} - - virtual bool dispatch() { - std::cout << "pe dispatch \n"; - return false; - } - - virtual ~PE() {} -}; - -} // namespace zynqmp -} // namespace paddle_mobile - -#endif /* PE_hpp */ diff --git a/mobile/src/fpga/KD/pe_params.hpp b/mobile/src/fpga/KD/pe_params.hpp deleted file mode 100644 index f9a495fad8..0000000000 --- a/mobile/src/fpga/KD/pe_params.hpp +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifndef PEParams_hpp -#define PEParams_hpp - -#include -#include - -#include "llapi/zynqmp_api.h" -#include "tensor.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -struct PEParam {}; - -struct InputParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct OutputParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct ReLUParam : PEParam { - public: - bool enabled = false; -}; - -struct BatchnormParam : PEParam { - public: - Tensor* bias = nullptr; - Tensor* scale = nullptr; - Tensor* mean = nullptr; - Tensor* variance = nullptr; - float epsilon = 0; -}; - -struct BasicConvParam { - Tensor output; - Tensor filter; - Tensor scaleBias; - ConvArgs args; -}; - -struct ConvParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - Tensor* filter = nullptr; - BatchnormParam* batchnorm = nullptr; - ReLUParam relu; - int groups = 1; - std::vector strides; - std::vector paddings; - std::vector kernelSize; - std::vector dilations; - - Tensor* scale() { return scale_; } - - Tensor* bias() { return bias_; } - - // Tensor* quantizedFilter() { - // return quantizedFilter_; - // } - - std::vector& splitParams() { return splitParams_; } - - protected: - std::vector splitParams_; - // Tensor* quantizedFilter_ = new Tensor(); - Tensor* scale_ = new Tensor(); - Tensor* bias_ = new Tensor(); -}; - -struct DepthwiseConvParam : ConvParam { - public: - Tensor* quantizedFilter() { return quantizedFilter_; } - - DWconvArgs args; - - protected: - Tensor* quantizedFilter_ = new Tensor(); -}; - -enum PoolingType : int { - MAX = 0, - AVERAGE = 1, -}; - -struct PoolingParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - - PoolingType type = PoolingType::MAX; - bool globalPooling = false; - std::vector kernelSize; - std::vector strides; - std::vector paddings; - - PoolingArgs poolingArgs = {0}; -}; - -struct ConcatParam : PEParam { 
- public: - std::vector inputs; - Tensor* output; - int axis = 0; -}; - -struct ElementwiseAddParam : PEParam { - public: - std::vector inputs; - Tensor* output = nullptr; - int axis = 0; - ReLUParam relu; - - EWAddArgs ewargs; -}; - -struct FullyConnectedParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* filter = nullptr; - Tensor* bias = nullptr; - Tensor* output = nullptr; - - Tensor* quantizedFilter() { return quantizedFilter_; } - - Tensor* biasScale() { return biasScale_; } - - SplitConvArgs convArgs; - - protected: - Tensor* quantizedFilter_ = new Tensor(); - Tensor* biasScale_ = new Tensor(); -}; - -struct SoftmaxParam : PEParam { - public: - Tensor* input = nullptr; - - Tensor* output = nullptr; - - private: - Tensor* floatInput = nullptr; -}; -struct NormParam : PEParam { - public: - Tensor* input = nullptr; - - Tensor* output = nullptr; - - private: - Tensor* floatInput = nullptr; -}; -} // namespace zynqmp -} // namespace paddle_mobile - -#endif /* PEParams_hpp */ diff --git a/mobile/src/fpga/KD/pes/concat_pe.hpp b/mobile/src/fpga/KD/pes/concat_pe.hpp deleted file mode 100644 index 54169ad5d2..0000000000 --- a/mobile/src/fpga/KD/pes/concat_pe.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "../pe.hpp" -#include "../pe_params.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class ConcatPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - return true; - } - - void apply() {} - - bool dispatch() { - Tensor* output = param_.output; - Shape& output_shape = output->shape(); - float16* out_data = param_.output->data(); - - int channel_sum = 0; - int out_channel = output_shape.channel(); - float scale = 0; - for (int n = 0; n < param_.inputs.size(); n++) { - Tensor* input = param_.inputs[n]; - input->invalidate(); - scale = std::max(scale, input->scale()[0]); - Shape& input_shape = input->shape(); - int wh = output_shape.width() * output_shape.height(); - for (int j = 0; j < wh; j++) { - float16* src = input->data() + j * input_shape.channel(); - memcpy(out_data + j * out_channel + channel_sum, src, - input_shape.channel() * sizeof(float16)); - } - channel_sum += input_shape.channel(); - } - output->scale()[0] = scale; - output->scale()[1] = 1.0f / scale; - std::cout << "conv scale::" << scale << std::endl; - output->flush(); - return true; - } - - ConcatParam& param() { return param_; } - - private: - ConcatParam param_; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/conv_pe.hpp b/mobile/src/fpga/KD/pes/conv_pe.hpp deleted file mode 100644 index 5ef89e920e..0000000000 --- a/mobile/src/fpga/KD/pes/conv_pe.hpp +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "../llapi/image.h" -#include "../pe.hpp" -#include "../pe_params.hpp" -#include "concat_pe.hpp" -#include "conv_pe.hpp" -#include "conv_process.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class ConvPE : public PE { - public: - bool init() { - std::cout << "Conv init" << std::endl; - return true; - } - - void apply() { - // process scale and bias; - BatchnormParam* bn = param_.batchnorm; - int channel = param_.output->shape().channel(); - Shape sb_shape(N, {channel}); - float* new_scale_ptr = param_.scale()->mutableData(FP32, sb_shape); - float* new_bias_ptr = param_.bias()->mutableData(FP32, sb_shape); - if (bn != nullptr) { - float* bn_scale_ptr = bn->scale->data(); - float* bn_bias_ptr = bn->bias->data(); - float* bn_var_ptr = bn->variance->data(); - float* bn_mean_ptr = bn->mean->data(); - float epsilon = bn->epsilon; - for (int i = 0; i < channel; i++) { - float new_scale = - bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_scale_ptr[i] = new_scale; - new_bias_ptr[i] = - bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - } - } else { - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = 1.0f; - new_bias_ptr[i] = 0.0f; - } - } - fill_split_arg(param_); - if (param_.splitParams().size() > 1) { - ConcatParam& concat_param = concatPE_.param(); - for (auto conv_param : param_.splitParams()) { - concat_param.inputs.push_back(&conv_param->output); - } - concat_param.output = param_.output; - concatPE_.init(); - concatPE_.apply(); - } - } - - bool dispatch() { - std::vector& 
params = param_.splitParams(); - int ret = 0; - for (auto conv_param : params) { - ret |= compute_fpga_conv_basic(conv_param->args); - } - size_t size = params.size(); - if (ret == 0 && size > 1) { - concatPE_.dispatch(); - } - return ret == 0; - } - - ConvParam& param() { return param_; } - - private: - ConvParam param_; - ConcatPE concatPE_; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/conv_process.hpp b/mobile/src/fpga/KD/pes/conv_process.hpp deleted file mode 100644 index 13bcaccabd..0000000000 --- a/mobile/src/fpga/KD/pes/conv_process.hpp +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifndef conv_process_hpp -#define conv_process_hpp - -#include -#include -#include - -#include "../float16.hpp" -#include "../llapi/bias_scale.h" -#include "../llapi/filter.h" -#include "../llapi/image.h" -#include "../tensor.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -inline int get_aligned_filter_element_num(int chw) { - return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); -} - -inline int get_filter_num_per_div(Tensor* filter, int group_num) { - auto chw = filter->shape().channel() * filter->shape().height() * - filter->shape().width(); - auto num = filter->shape().num(); - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_num_per_div(num, group_num, div_capacity); -} - -inline int get_split_num(Tensor* filter) { - auto chw = filter->shape().channel() * filter->shape().height() * - filter->shape().width(); - auto num = filter->shape().num(); - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); -} - -inline void format_scale_bias(Tensor* scale, Tensor* bias, Tensor* filter, - Tensor* scale_bias, int group) { - float* scale_data = nullptr; - float* bias_data = nullptr; - if (scale != nullptr) { - scale_data = scale->data(); - } - if (bias != nullptr) { - bias_data = bias->data(); - } - int channel = filter->shape().num(); - Shape bias_scale_shape(N, {2 * channel}); - float* bs_data = scale_bias->mutableData(FP32, bias_scale_shape); - for (int i = 0; i < channel; i++) { - float scale_value = scale_data == nullptr ? 1 : scale_data[i]; - float bias_value = bias_data == nullptr ? 
0 : bias_data[i]; - bs_data[i + channel] = scale_value; - bs_data[i] = bias_value; - } - - int element_num_per_div = get_filter_num_per_div(filter, group); - bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel); -} - -inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) { - float max_value = find_max(*filter); - Shape& filter_shape = filter->shape(); - quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, filter->shape()); - quantized_filter->scale()[0] = max_value / 127.0f; - quantized_filter->scale()[1] = 127.0f / max_value; - - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = reinterpret_cast(fpga_malloc(memory_size)); - memcpy(new_data, filter->data(), memory_size); - size_t mem_size = filter::format_filter( - &new_data, filter_shape.num(), filter_shape.channel(), - filter_shape.height(), filter_shape.width(), group, max_value); - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, mem_size); - fpga_free(new_data); - quantized_filter->flush(); -} - -inline void format_dw_filter(Tensor* filter, Tensor* quantized_filter, - float* scale) { - int num = filter->shape().num(); - int height = filter->shape().height(); - int width = filter->shape().width(); - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = (float*)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, filter->data(), memory_size); - - filter::format_dwconv_filter(&new_data, num, height, width, scale); - float16* src = quantized_filter->mutableData(FP16, filter->shape()); - memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(float16))); - quantized_filter->flush(); - - fpga_free(new_data); -} - -inline void format_fc_filter(Tensor* filter, Tensor* quantized_filter) { - float max_value = find_max(*filter); - Shape& filter_shape = filter->shape(); - quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, 
filter->shape()); - quantized_filter->scale()[0] = max_value / 127.0f; - quantized_filter->scale()[1] = 127.0f / max_value; - - size_t memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = (float*)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, filter->data(), memory_size); - filter::format_fc_filter(&new_data, filter_shape.num(), - filter_shape.channel(), filter_shape.height(), - filter_shape.width(), 1, max_value); - - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(int8_t))); - quantized_filter->flush(); - fpga_free(new_data); -} - -inline void fill_split_arg(const ConvParam& c_param) { - ConvParam& param = const_cast(c_param); - Tensor* input = param.input; - Tensor* out = param.output; - Tensor* filter = param.filter; - auto channel = out->shape().channel(); - - int split_num = param.groups == 1 ? get_split_num(param.filter) : 1; - int filter_num_per_div = get_filter_num_per_div(filter, param.groups); - int element_num = get_aligned_filter_element_num(filter->shape().channel() * - filter->shape().height() * - filter->shape().width()); - - Shape& out_shape = out->shape(); - for (int i = 0; i < split_num; i++) { - BasicConvParam* conv_param = new BasicConvParam(); - - int filter_num = filter->shape().num(); - float16* out_address = nullptr; - int8_t* filter_address = nullptr; - float* sb_address = nullptr; - float* out_scale_address = nullptr; - - ConvArgs& args = conv_param->args; - - if (split_num == 1) { - out_address = out->data(); - out_scale_address = out->scale(); - } - filter_num = i == split_num - 1 - ? 
channel - (split_num - 1) * filter_num_per_div // NOLINT - : filter_num_per_div; - if (split_num != 1) { - Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num}); - out_address = conv_param->output.mutableData(FP16, shape); - out_scale_address = conv_param->output.scale(); - } - Shape f_shape(NCHW, {filter_num, filter->shape().channel(), - filter->shape().height(), filter->shape().width()}); - - Tensor new_filter; - float* new_filter_data = new_filter.mutableData(FP32, f_shape); - int filter_hwc = filter->shape().height() * filter->shape().width() * - filter->shape().channel(); - memcpy(new_filter_data, - filter->data() + i * filter_num_per_div * filter_hwc, - filter_num * filter_hwc * sizeof(float)); - new_filter.flush(); - conv_param->filter.mutableData(FP32, f_shape); - format_filter(&new_filter, &(conv_param->filter), param.groups); - filter_address = conv_param->filter.data(); - std::cout << conv_param->filter.scale()[0] << std::endl; - args.filter_scale_address = conv_param->filter.scale(); - - int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT); - Tensor scale; - Tensor bias; - - int chnnnel_start = i * filter_num_per_div; - - Shape s_shape(N, {filter_num}); - float* scale_data = scale.mutableData(FP32, s_shape); - float* bias_data = bias.mutableData(FP32, s_shape); - for (int i = 0; i < filter_num; i++) { - scale_data[i] = param.scale()->data()[i + chnnnel_start]; - } - for (int i = 0; i < filter_num; i++) { - // bias_data[i] = 0.0f;//TODO - bias_data[i] = param.bias()->data()[i + chnnnel_start]; - } - Shape sb_shape(N, {sb_num}); - format_scale_bias(&scale, &bias, &conv_param->filter, - &conv_param->scaleBias, param.groups); - sb_address = conv_param->scaleBias.mutableData(FP32, sb_shape); - - args.group_num = param.groups; - args.relu_enabled = param.relu.enabled; - args.sb_address = sb_address; - args.kernel.stride_h = param.strides[1]; - args.kernel.stride_w = param.strides[0]; - args.kernel.height = new_filter.shape().height(); 
- args.kernel.width = new_filter.shape().width(); - - args.filter_address = filter_address; - args.filter_num = filter_num; - - args.image.address = input->data(); - args.image.scale_address = input->scale(); - args.image.channels = input->shape().channel(); - args.image.width = input->shape().width(); - args.image.height = input->shape().height(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; - - args.output.address = out_address; - args.output.scale_address = out_scale_address; - param.splitParams().push_back(conv_param); - } -} - -inline void fill_split_arg(struct SplitConvArgs* arg, Tensor* input, - Tensor* out, Tensor* filter, bool relu_enabled, - int group_num, int stride_h, int stride_w, - int padding_h, int padding_w, float* bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto out_ptr = out->data(); - - arg->group_num = (uint32_t)group_num; - arg->split_num = group_num == 1 ? get_split_num(filter) : 1; - arg->filter_num = filter->shape().num(); - arg->output.address = out_ptr; - arg->output.scale_address = out->scale(); - arg->conv_arg = - (ConvArgs*)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT - - memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs)); - - arg->concat_arg.image_num = arg->split_num; - arg->concat_arg.image_out = out_ptr; - arg->concat_arg.scale_out = out->scale(); - arg->concat_arg.height = out->shape().height(); - arg->concat_arg.width = out->shape().width(); - - int n = arg->split_num; - arg->concat_arg.images_in = (half**)fpga_malloc(n * sizeof(int*)); // NOLINT - arg->concat_arg.scales_in = - (float**)fpga_malloc(n * sizeof(float*)); // NOLINT - arg->concat_arg.channel_num = - (uint32_t*)fpga_malloc(n * sizeof(uint32_t)); // NOLINT - - auto channel = out->shape().channel(); - int filter_num_per_div = get_filter_num_per_div(filter, group_num); - int element_num = get_aligned_filter_element_num(filter->shape().channel() * - 
filter->shape().height() * - filter->shape().width()); - - for (int i = 0; i < n; i++) { - arg->conv_arg[i].relu_enabled = relu_enabled; - arg->conv_arg[i].group_num = (uint32_t)group_num; - arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_arg[i].kernel.height = filter->shape().height(); - arg->conv_arg[i].kernel.width = filter->shape().width(); - arg->conv_arg[i].image.address = input_ptr; - arg->conv_arg[i].image.channels = input->shape().channel(); - arg->conv_arg[i].image.height = input->shape().height(); - arg->conv_arg[i].image.width = input->shape().width(); - arg->conv_arg[i].image.scale_address = input->scale(); - arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; - arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; - arg->conv_arg[i].filter_scale_address = filter->scale(); - arg->conv_arg[i].filter_num = (uint32_t)( - i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT - : filter_num_per_div); - - size_t filter_size = - element_num * - align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) * - sizeof(int8_t); - auto filter_head = - &((int8_t*)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_arg[i].filter_address = fpga_malloc(filter_size); - memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); - fpga_flush(arg->conv_arg[i].filter_address, filter_size); - - size_t bs_size = 2 * - align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) * - sizeof(float); - auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_arg[i].sb_address = fpga_malloc(bs_size); - memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); - fpga_flush(arg->conv_arg[i].sb_address, bs_size); - - if (n > 1) { - arg->conv_arg[i].output.scale_address = - (float*)fpga_malloc(2 * sizeof(float)); // NOLINT - arg->conv_arg[i].output.address = fpga_malloc( - out->shape().height() * - align_to_x(out->shape().width() * arg->conv_arg[i].filter_num, - 
IMAGE_ALIGNMENT) * - sizeof(half)); - } else { - arg->conv_arg[i].output.scale_address = out->scale(); - arg->conv_arg[i].output.address = out_ptr; - } - - arg->concat_arg.images_in[i] = - (half*)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; - arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; - } -} - -inline int do_concat(const struct ConcatArgs& args) { - image::concat_images(args.images_in, args.scales_in, args.image_out, - args.scale_out, args.image_num, args.channel_num, - args.height, args.width); - return 0; -} - -inline bool compute_conv(const ConvParam& c_conv_params) { - ConvParam& conv_params = const_cast(c_conv_params); - std::vector& params = conv_params.splitParams(); - int ret = 0; - for (auto conv_param : params) { - ret |= compute_fpga_conv_basic(conv_param->args); - } - size_t size = params.size(); - if (ret == 0 && size > 1) { - Tensor* output = conv_params.output; - - Tensor& img = params[0]->output; - for (int i = 0; i < 1; i++) { - for (int i = 0; i < img.shape().numel(); i++) { - float value = half_to_float(img.data()[i]); - std::cout << "value:" << value << std::endl; - } - } - } - return ret == 0; -} - -inline bool compute_conv(const SplitConvArgs& args) { - int ret = 0; - int split_num = args.split_num; - for (int i = 0; i < split_num; i++) { - ret |= compute_fpga_conv_basic(args.conv_arg[i]); - } - - if (split_num > 1) { - do_concat(args.concat_arg); - } - return ret == 0; -} - -} // namespace zynqmp -} // namespace paddle_mobile - -#endif /* conv_process_hpp */ diff --git a/mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp b/mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp deleted file mode 100644 index 43dbb4f4a1..0000000000 --- a/mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "../float16.hpp" -#include "../pe.hpp" -#include "../pe_params.hpp" -#include "conv_process.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class DepthwiseConvPE : public PE { - public: - bool init() { - std::cout << "DWConv init" << std::endl; - return true; - } - - void apply() { - DepthwiseConvParam& param = param_; - Tensor* input = param.input; - Tensor* output = param.output; - int channel = output->shape().channel(); - - Tensor* new_scale = param.scale(); - Tensor* new_bias = param.bias(); - Shape shape(NC, {channel, 1}); - float* new_scale_data = new_scale->mutableData(FP32, shape); - float16* new_bias_data = new_bias->mutableData(FP16, shape); - - BatchnormParam* batchnorm = param.batchnorm; - memset(new_scale_data, 0, new_scale->shape().memorySize(sizeof(float16))); - memset(new_bias_data, 0, new_bias->shape().memorySize(sizeof(float16))); - if (batchnorm != nullptr) { - for (size_t i = 0; i < channel; i++) { - // TODO(chonwhite) combine; - } - } else { - float16 zero = float_to_half(0.0f); - for (size_t i = 0; i < channel; i++) { - new_bias_data[i] = zero; - new_scale_data[i] = 1.0f; - } - } - - Tensor* quantized_filter = param.quantizedFilter(); - quantized_filter->mutableData(FP16, param.filter->shape()); - format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data); - - DWconvArgs args = {0}; - - void* filter_address = quantized_filter->data(); - std::cout << "filter:" << 
filter_address; - - args.bias_address = new_bias_data; - args.filter_address = param.quantizedFilter()->data(); - args.kernel.width = param.kernelSize[0]; - args.kernel.height = param.kernelSize[1]; - args.kernel.stride_w = param.strides[0]; - args.kernel.stride_h = param.strides[1]; - args.image.address = input->data(); - args.image.channels = input->shape().channel(); - args.image.height = input->shape().height(); - args.image.width = input->shape().width(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; - args.image.scale_address = input->scale(); - args.output.address = output->data(); - args.output.scale_address = output->scale(); - args.out_width = param.output->shape().width(); - args.out_height = param.output->shape().height(); - args.sub_conv_num = 1; - param.args = args; - } - - bool dispatch() { return compute_fpga_dwconv(param_.args) == 0; } - - DepthwiseConvParam& param() { return param_; } - - private: - DepthwiseConvParam param_; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/elementwise_add_pe.hpp b/mobile/src/fpga/KD/pes/elementwise_add_pe.hpp deleted file mode 100644 index c4fab49a3d..0000000000 --- a/mobile/src/fpga/KD/pes/elementwise_add_pe.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "../pe.hpp" -#include "../pe_params.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class ElementwiseAddPE : public PE { - public: - bool init() { return true; } - - void apply() { - Tensor* input0 = param_.inputs[0]; - Tensor* input1 = param_.inputs[1]; - Tensor* output = param_.output; - EWAddArgs args = {0}; - args.const0 = 0x3c00; - args.const1 = 0x3c00; // =1 - args.image0.address = input0->data(); - args.image0.channels = input0->shape().channel(); - args.image0.scale_address = input0->scale(); - args.image0.height = input0->shape().height(); - args.image0.width = input0->shape().width(); - args.image0.pad_height = 0; - args.image0.pad_width = 0; - args.image1.address = input1->data(); - args.image1.channels = input1->shape().channel(); - args.image1.scale_address = input1->scale(); - args.image1.height = input1->shape().height(); - args.image1.width = input1->shape().width(); - args.image1.pad_height = 0; - args.image1.pad_width = 0; - args.output.scale_address = output->scale(); - args.output.address = output->data(); - param_.ewargs = args; - } - - bool dispatch() { - InplaceArgs inplace_args = {0}; - if (param_.relu.enabled) { - inplace_args.relu_enable = true; - config_inplace(inplace_args); - } - compute_fpga_ewadd(param_.ewargs); - if (param_.relu.enabled) { - inplace_args.relu_enable = false; - config_inplace(inplace_args); - } - return true; - } - - ElementwiseAddParam& param() { return param_; } - - private: - ElementwiseAddParam param_; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/fully_connected_pe.hpp b/mobile/src/fpga/KD/pes/fully_connected_pe.hpp deleted file mode 100644 index 0082cf0aa9..0000000000 --- a/mobile/src/fpga/KD/pes/fully_connected_pe.hpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "../pe.hpp" -#include "../pe_params.hpp" -#include "conv_process.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class FullyConnectedPE : public PE { - public: - bool init() { return true; } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - - convParam_.input = param_.input; - convParam_.output = param_.output; - // convParam_.relu = param_.relu; - convParam_.groups = 1; - convParam_.strides = {1, 1}; - convParam_.paddings = {0, 0}; - convParam_.kernelSize = {input->shape().width(), input->shape().height()}; - convParam_.dilations = {1, 1}; - - int num = param_.filter->shape().channel(); - int chw = param_.filter->shape().num(); - - int height = param_.input->shape().height(); - int width = param_.input->shape().width(); - int filter_channel = chw / height / width; - - int channel = param_.output->shape().channel(); - Shape shape(NCHW, {num, filter_channel, height, width}); - Tensor* conv_filter = new Tensor(); - float* new_filter_data = conv_filter->mutableData(FP32, shape); - float* filter_data = param_.filter->data(); - - for (int i = 0; i < num; i++) { - float sum = 0; - float* f_start = filter_data + i * chw; - for (int j = 0; j < chw; j++) { - float scale = filter_data[j * num + i]; - new_filter_data[i * chw + j] = scale; - } - } - - conv_filter->flush(); - convParam_.filter = conv_filter; - - Shape sb_shape(N, {channel}); - float* scale_data = 
convParam_.scale()->mutableData(FP32, sb_shape); - float* bias_data = convParam_.bias()->mutableData(FP32, sb_shape); - - for (int i = 0; i < channel; i++) { - scale_data[i] = 1.0f; - bias_data[i] = param_.bias->data()[i]; - } - - fill_split_arg(convParam_); - } - - bool dispatch() { - int ret = 0; - std::vector& params = convParam_.splitParams(); - - for (auto conv_param : params) { - std::cout << "conv basic \n"; - ret |= compute_fpga_conv_basic(conv_param->args); - } - return ret == 0; - } - - FullyConnectedParam& param() { return param_; } - - private: - FullyConnectedParam param_; - ConvParam convParam_; -}; -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/input_pe.hpp b/mobile/src/fpga/KD/pes/input_pe.hpp deleted file mode 100644 index ad3187c1f9..0000000000 --- a/mobile/src/fpga/KD/pes/input_pe.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "../pe.hpp" -#include "../pe_params.hpp" -namespace paddle_mobile { -namespace zynqmp { - -class InputPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - return true; - } - - bool dispatch() { - std::cout << "InputPE dispatch \n"; - Tensor* input = param_.input; - Tensor* output = param_.output; - - Tensor* src = input; - Tensor half_tensor; - if (input->dataType() == DataType::FP32) { - half_tensor.mutableData(DataType::FP16, input->shape()); - half_tensor.copyFrom(input); - src = &half_tensor; - } - output->mutableData(); - src->alignImage(output, true); - return true; - } - - InputParam& param() { return param_; } - - private: - InputParam param_; -}; -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/math_func_neon.h b/mobile/src/fpga/KD/pes/math_func_neon.h deleted file mode 100755 index f34e30036c..0000000000 --- a/mobile/src/fpga/KD/pes/math_func_neon.h +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* NEON implementation of sin, cos, exp and log - * - * Inspired by Intel Approximate Math library, and based on the - * corresponding algorithms of the cephes math library - */ - -/* Copyright (C) 2011 Julien Pommier - * - * This software is provided 'as-is', without any express or implied - * warranty. 
In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - * - * (this is the zlib license) - */ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#pragma once - -#include - -static const int32_t c_inv_mant_mask = ~0x7f800000u; -static const float c_cephes_SQRTHF = 0.707106781186547524; -static const float c_cephes_log_p0 = 7.0376836292E-2; -static const float c_cephes_log_p1 = -1.1514610310E-1; -static const float c_cephes_log_p2 = 1.1676998740E-1; -static const float c_cephes_log_p3 = -1.2420140846E-1; -static const float c_cephes_log_p4 = +1.4249322787E-1; -static const float c_cephes_log_p5 = -1.6668057665E-1; -static const float c_cephes_log_p6 = +2.0000714765E-1; -static const float c_cephes_log_p7 = -2.4999993993E-1; -static const float c_cephes_log_p8 = +3.3333331174E-1; -static const float c_cephes_log_q1 = -2.12194440e-4; -static const float c_cephes_log_q2 = 0.693359375; - -/* natural logarithm computed for 4 simultaneous float - * return NaN for x <= 0 - */ -static inline float32x4_t log_ps(float32x4_t x) { - float32x4_t one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ - uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); - - int32x4_t ux = vreinterpretq_s32_f32(x); - - int32x4_t 
emm0 = vshrq_n_s32(ux, 23); - - /* keep only the fractional part */ - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - float32x4_t e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - /* part2: - * if( x < SQRTHF ) { - * e -= 1; - * x = x + x - 1.0; - * } else { x = x - 1.0; } - */ - uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - float32x4_t tmp = - vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32( - e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - float32x4_t z = vmulq_f32(x, x); - - float32x4_t y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN - return x; -} - -static const float c_exp_hi = 88.3762626647949f; -static const float c_exp_lo = -88.3762626647949f; - -static const float c_cephes_LOG2EF = 1.44269504088896341; -static const 
float c_cephes_exp_C1 = 0.693359375; -static const float c_cephes_exp_C2 = -2.12194440e-4; - -static const float c_cephes_exp_p0 = 1.9875691500E-4; -static const float c_cephes_exp_p1 = 1.3981999507E-3; -static const float c_cephes_exp_p2 = 8.3334519073E-3; -static const float c_cephes_exp_p3 = 4.1665795894E-2; -static const float c_cephes_exp_p4 = 1.6666665459E-1; -static const float c_cephes_exp_p5 = 5.0000001201E-1; - -/* exp() computed for 4 float at once */ -static inline float32x4_t exp_ps(float32x4_t x) { - float32x4_t tmp, fx; - - float32x4_t one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - uint32x4_t mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, - c_cephes_exp_p2, c_cephes_exp_p3, - c_cephes_exp_p4, c_cephes_exp_p5}; - float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0); - float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1); - float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2); - float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3); - float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4); - float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - /* 
build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - float32x4_t pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -static const float c_minus_cephes_DP1 = -0.78515625; -static const float c_minus_cephes_DP2 = -2.4187564849853515625e-4; -static const float c_minus_cephes_DP3 = -3.77489497744594108e-8; -static const float c_sincof_p0 = -1.9515295891E-4; -static const float c_sincof_p1 = 8.3321608736E-3; -static const float c_sincof_p2 = -1.6666654611E-1; -static const float c_coscof_p0 = 2.443315711809948E-005; -static const float c_coscof_p1 = -1.388731625493765E-003; -static const float c_coscof_p2 = 4.166664568298827E-002; -static const float c_cephes_FOPI = 1.27323954473516; // 4 / M_PI - -/* evaluation of 4 sines & cosines at once. - * - * The code is the exact rewriting of the cephes sinf function. - * Precision is excellent as long as x < 8192 (I did not bother to - * take into account the special handling they have for greater values - * -- it does not return garbage for arguments over 8192, though, but - * the extra precision is missing). - * - * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - * surprising but correct result. - * - * Note also that when you compute sin(x), cos(x) is available at - * almost no extra price so both sin_ps and cos_ps make use of - * sincos_ps.. 
- */ -static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, - float32x4_t *ycos) { - // any x - float32x4_t xmm1, xmm2, xmm3, y; - - uint32x4_t emm2; - - uint32x4_t sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - /* scale by 4/Pi */ - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - /* store the integer part of y in mm0 */ - emm2 = vcvtq_u32_f32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - /* get the polynom selection mask - * there is one polynom for 0 <= x <= Pi/4 - * and another one for Pi/4setAligned(false); - return true; - } - - bool dispatch() { - Tensor* input = param_.input; - Tensor* output = param_.output; - Tensor* src_tensor = input; - Tensor float_tensor; - input->invalidate(); - float_tensor.mutableData(DataType::FP32, input->shape()); - if (input->dataType() == DataType::FP16) { - float_tensor.copyFrom(input); - src_tensor = &float_tensor; - } - src_tensor->unalignImage(output, true); - return true; - } - - OutputParam& param() { return param_; } - - private: - OutputParam param_; -}; -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/pooling_pe.hpp b/mobile/src/fpga/KD/pes/pooling_pe.hpp deleted file mode 100644 index 421f30cd33..0000000000 --- a/mobile/src/fpga/KD/pes/pooling_pe.hpp +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "../pe.hpp" -#include "../pe_params.hpp" -namespace paddle_mobile { -namespace zynqmp { - -class PoolingPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - return true; - } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - - uint32_t k_width = param_.kernelSize[0]; - uint32_t k_height = param_.kernelSize[1]; - - if (param_.globalPooling) { - k_width = input->shape().width(); - k_height = input->shape().height(); - } - - PoolingArgs args = {0}; - args.mode = param_.type; - args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); - args.image.address = input->data(); - args.image.channels = input->shape().channel(); - args.image.height = input->shape().height(); - args.image.width = input->shape().width(); - args.image.pad_height = param_.paddings[0]; - args.image.pad_width = param_.paddings[1]; - args.image.scale_address = input->scale(); - args.output.address = output->mutableData(); - args.output.scale_address = output->scale(); - args.kernel.height = k_height; - args.kernel.width = k_width; - args.kernel.stride_h = param_.strides[0]; - args.kernel.stride_w = param_.strides[1]; - args.out_height = output->shape().height(); - args.out_width = output->shape().width(); - param_.poolingArgs = args; - } - - bool dispatch() { return compute_fpga_pool(param_.poolingArgs) == 0; } - - PoolingParam& param() { return param_; } - - private: - PoolingParam param_; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/softmax_pe.cpp b/mobile/src/fpga/KD/pes/softmax_pe.cpp deleted file mode 100644 index f4596d3aa7..0000000000 --- a/mobile/src/fpga/KD/pes/softmax_pe.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "softmax_pe.hpp" - -#include - -namespace paddle_mobile { -namespace zynqmp { - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#ifndef __aarch64__ -static inline float32_t vmaxvq_f32(const float32x4_t &r) { - float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpmax_f32(v, v), 0); -} - -static inline float32_t vaddvq_f32(const float32x4_t &r) { - float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpadd_f32(v, v), 0); -} -#endif // __aarch64__ -#endif // __ARM_NEON__ - -static float find_max(const float *input, const int num_classes) { - int remain = num_classes; - float max = -std::numeric_limits::max(); -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - for (int i = 0; i < loop; ++i, input += 8) { - float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - __max = vmaxq_f32(x0, __max); - __max = vmaxq_f32(x1, __max); - } - max = vmaxvq_f32(__max); -#endif - for (int i = 0; i < remain; ++i) { - max = std::max(max, input[i]); - } - return max; -} - -static void softmax(Tensor *X, Tensor *Y) { - std::vector dims = X->shape().dims(); - int batch_size = X->shape().num(); - int num_classes = dims[X->shape().dimSize() - 1]; - int channels = X->shape().numel() / batch_size / num_classes; - float *x = X->data(); - float *y = Y->mutableData(); - 
-#pragma omp parallel for collapse(2) - for (int batch = 0; batch < batch_size; ++batch) { - for (int channel = 0; channel < channels; ++channel) { - size_t offset = (batch * channels + channel) * num_classes; - const float *input = x + offset; - float *output = y + offset; - // find max - float max = find_max(input, num_classes); - - // exp(x - max) - int remain = num_classes; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - for (int i = 0; i < loop; ++i, input += 8, output += 8) { - float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - x0 = vsubq_f32(x0, __max); - x1 = vsubq_f32(x1, __max); - x0 = exp_ps(x0); - x1 = exp_ps(x1); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } -#endif // __ARM_NEON__ - for (int i = 0; i < remain; ++i) { - output[i] = expf(input[i] - max); - } - - // sum(exp(x - max)) - float sum = 0.f; - output = y + offset; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - float32x4_t __sum = vdupq_n_f32(0.f); - for (int i = 0; i < loop; ++i, output += 8) { - float32x4_t x0 = vld1q_f32(output); - float32x4_t x1 = vld1q_f32(output + 4); - __sum = vaddq_f32(x0, __sum); - __sum = vaddq_f32(x1, __sum); - } - sum += vaddvq_f32(__sum); -#endif // __ARM_NEON__ - for (int i = 0; i < remain; ++i) { - sum += output[i]; - } - - // exp(x - max) / sum - float inv_sum = 1.f / sum; - output = y + offset; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - float32x4_t __inv_sum = vdupq_n_f32(inv_sum); - for (int i = 0; i < loop; ++i, output += 8) { - float32x4_t x0 = vld1q_f32(output); - float32x4_t x1 = vld1q_f32(output + 4); - x0 = vmulq_f32(x0, __inv_sum); - x1 = vmulq_f32(x1, __inv_sum); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } -#endif - for (int i = 0; i < remain; ++i) { - output[i] *= inv_sum; - } - } - } -} - -bool SoftmaxPE::init() { - Tensor *output = param_.output; - output->setAligned(false); - 
return true; -} - -bool SoftmaxPE::dispatch() { - Tensor *input = param_.input; - Tensor *output = param_.output; - input->invalidate(); - - Tensor float_input; - Tensor float_output; - float_input.mutableData(DataType::FP32, input->shape()); - float_input.copyFrom(input); - float_input.unalignImage(); - - float *out_data = - float_output.mutableData(DataType::FP32, input->shape()); - - softmax(&float_input, &float_output); - float_output.flush(); - - output->copyFrom(&float_output); - return true; -} - -SoftmaxParam &SoftmaxPE::param() { return param_; } -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/pes/softmax_pe.hpp b/mobile/src/fpga/KD/pes/softmax_pe.hpp deleted file mode 100644 index 42b4014616..0000000000 --- a/mobile/src/fpga/KD/pes/softmax_pe.hpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#include -#include "fpga/KD/pes/math_func_neon.h" -#endif - -#include "../pe.hpp" -#include "../pe_params.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -class SoftmaxPE : public PE { - public: - bool init(); - bool dispatch(); - - SoftmaxParam& param(); - - private: - SoftmaxParam param_; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/shape.hpp b/mobile/src/fpga/KD/shape.hpp deleted file mode 100644 index 587df10310..0000000000 --- a/mobile/src/fpga/KD/shape.hpp +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "fpga/KD/alignment.h" -#include "fpga/KD/layout.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -static struct NCHW nchw_; -static struct NHWC nhwc_; -static struct NC nc_; -static struct NHW nhw_; -static struct N n_; - -class Shape { - public: - explicit Shape(std::vector dims) { dims_ = dims; } - - Shape(LayoutType type, std::vector dims) { - dims_ = dims; - setLayoutType(type); - } - - Shape(const Shape& src) { - dims_ = src.dims_; - setLayoutType(src.layoutType_); - } - - bool shouldAlign() { - return layout_->alignedElementCount(dims_) != layout_->elementCount(dims_); - } - - int num() { - int index = layout_->numIndex(); - return index == -1 ? 
1 : dims_[index]; - } - - int channel() { - int index = layout_->channelIndex(); - return index == -1 ? 1 : dims_[index]; - } - - int height() { - int index = layout_->heightIndex(); - return index == -1 ? 1 : dims_[index]; - } - - int width() { - int index = layout_->widthIndex(); - return index == -1 ? 1 : dims_[index]; - } - - int dimSize() { return dims_.size(); } - - std::vector dims() { return dims_; } - - size_t memorySize(int cellSize) { - return layout_->alignedElementCount(dims_) * cellSize; - } - - int numel() { return layout_->elementCount(dims_); } - - void setLayoutType(LayoutType layout) { - this->layoutType_ = layout; - switch (layout) { - case NCHW: - layout_ = &nchw_; - break; - case NHWC: - layout_ = &nhwc_; - break; - case NC: - layout_ = &nc_; - break; - case NHW: - layout_ = &nhw_; - break; - case N: - layout_ = &n_; - break; - default: - break; - } - } - - int operator[](int index) { return dims_[index]; } - - private: - LayoutType layoutType_; - Layout* layout_ = &nhwc_; - std::vector dims_; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/tensor.hpp b/mobile/src/fpga/KD/tensor.hpp deleted file mode 100644 index 496d6f7792..0000000000 --- a/mobile/src/fpga/KD/tensor.hpp +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "float16.hpp" -#include "llapi/zynqmp_api.h" -#include "shape.hpp" - -namespace paddle_mobile { -namespace zynqmp { - -enum DataType : int { - FP32 = 0, - FP16 = 1, - INT8 = 2, -}; - -typedef uint16_t float16; - -inline int CellSize(DataType type) { - switch (type) { - case FP32: - return sizeof(float); - case FP16: - return sizeof(float16); - case INT8: - return sizeof(int8_t); - default: - return 0; - } - return 0; -} - -class PlaceHolder { - public: - explicit PlaceHolder(size_t size) { - size_ = size; - data_ = fpga_malloc(size_); - } - - void* data() { return data_; } - - size_t memorySize() { return size_; } - - ~PlaceHolder() { - std::cout << "place holder dealloc"; - fpga_free(data_); - } - - private: - void* data_ = nullptr; - size_t size_ = 0; -}; - -class Tensor { - public: - int id() { return id_; } - - template - Dtype* data() { - if (placeHolder_ == nullptr) { - return nullptr; - } - return reinterpret_cast(this->placeHolder_->data()); - } - - template - Dtype* mutableData(DataType dataType, const Shape& shape) { - // if (this->shape_ != &shape) { - if (this->shape_ != nullptr) { - delete shape_; - } - this->shape_ = new Shape(shape); - // } - this->dataType_ = dataType; - return mutableData(); - } - - template - Dtype* mutableData() { - size_t memorySize = shape_->memorySize(CellSize(dataType_)); - if (placeHolder_ != nullptr) { - if (memorySize > placeHolder_->memorySize()) { - delete placeHolder_; - placeHolder_ = new PlaceHolder(memorySize); - } - } else { - placeHolder_ = new PlaceHolder(memorySize); - } - return reinterpret_cast(placeHolder_->data()); - } - - void setDataType(DataType dataType) { this->dataType_ = dataType; } - - DataType dataType() { return this->dataType_; } - - Shape& shape() { return *shape_; } - - bool aligned() { return this->aligned_; } - - void setAligned(bool aligned) { this->aligned_ = aligned; } - - float* scale() { return 
scale_; } - - void alignImage(Tensor* dst = nullptr, bool copy = false) { - if (shape_->shouldAlign()) { - int cell_size = CellSize(this->dataType_); - char* dst_data = nullptr; - size_t mem_size = shape_->memorySize(cell_size); - if (dst == nullptr) { - dst_data = reinterpret_cast(fpga_malloc(mem_size)); - } else { - dst_data = dst->data(); - } - int wc = shape_->width() * shape_->channel(); - int wc_aligned = align_image(wc); - int remainder = wc_aligned - wc; - - char* src_start = data(); - char* dst_start = dst_data; - for (int n = 0; n < shape_->num(); n++) { - for (int h = 0; h < shape_->height(); h++) { - memcpy(dst_start, src_start, wc * cell_size); - memcpy(dst_start + wc * cell_size, 0, remainder * cell_size); - src_start += wc * cell_size; - dst_start += wc_aligned * cell_size; - } - } - if (dst == nullptr) { - memcpy(data(), dst_data, mem_size); - flush(); - fpga_free(dst_data); - } else { - dst->flush(); - } - } else { - if (copy) { - dst->copyFrom(this); - } else { - // TODO(chonwhite) share data. 
- } - } - } - - void unalignImage(Tensor* dst = nullptr, bool copy = false) { - if (shape_->shouldAlign()) { - // int cell_size = CellSize(this->dataType_); - // char* dst_data = nullptr; - // size_t mem_size = shape_->memorySize(cell_size); - // if (dst == nullptr) { - // dst_data = (char*)fpga_malloc(mem_size); - // } else { - // dst_data = dst->data(); - // } - // int wc = shape_->width() * shape_->channel(); - // int wc_aligned = align_image(wc); - // int remainder = wc_aligned - wc; - - // char* src_start = data(); - // char* dst_start = dst_data; - // for (int n = 0; n < shape_->num(); n++) { - // for (int h = 0;h < shape_->height(); h++) { - // memcpy(dst_start, src_start, wc * cell_size); - // memcpy(dst_start + wc * cell_size, 0, remainder * cell_size); - // src_start += wc * cell_size; - // dst_start += wc_aligned * cell_size; - // } - // } - // if (dst == nullptr) { - // memcpy(data(), dst_data, mem_size); - // flush(); - // fpga_free(dst_data); - // } else { - // dst->flush(); - // } - } else { - if (copy) { - dst->copyFrom(this); - } else { - // TODO(chonwhite) share data. - } - } - } - - void copyFrom(Tensor* src) { - BypassArgs args; - args.input_data_type = - src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; - args.output_data_type = dataType_ == FP32 ? 
DATA_TYPE_FP32 : DATA_TYPE_FP16; - args.input_layout_type = LAYOUT_HWC; - args.output_layout_type = LAYOUT_HWC; - args.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().channel(), - .width = (uint32_t)src->shape().width(), - .height = (uint32_t)src->shape().height(), - .pad_width = 0u, - .pad_height = 0u}; - args.output = { - .address = data(), - .scale_address = scale(), - }; - src->flush(); - perform_bypass(args); - this->invalidate(); - } - - void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); } - - void invalidate() { - fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize()); - } - - void print() { - int count = shape_->numel(); - for (int i = 0; i < count; i++) { - std::cout << "" << '\n'; - } - } - - void saveToFile() { - std::string path = std::to_string(id_) + ".txt"; - saveToFile(path); - } - - void saveToFile(std::string path) { - std::ofstream ofs; - static int counter = 0; - std::string npath = std::to_string(counter) + "_" + path; - counter++; - ofs.open(npath); - for (size_t i = 0; i < shape_->numel(); i++) { - float value = 0; - if (dataType_ == FP32) { - value = data()[i]; - } else { - value = half_to_float(data()[i]); - } - ofs << value << std::endl; - } - ofs.close(); - } - - private: - float scale_[2]; - Shape* shape_ = nullptr; - DataType dataType_ = FP32; - bool aligned_ = false; - - static int generateID() { - static int sID = 0; - int id = sID++; - return id; - } - - int id_ = generateID(); - - PlaceHolder* placeHolder_ = nullptr; -}; - -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/tensor_util.cpp b/mobile/src/fpga/KD/tensor_util.cpp deleted file mode 100644 index 29b6595788..0000000000 --- a/mobile/src/fpga/KD/tensor_util.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "tensor_util.hpp" - -namespace paddle_mobile { -namespace zynqmp { -float find_max(const Tensor& tensor) { - float max = 0; - Tensor& t = const_cast(tensor); - float* data = t.data(); - for (int i = 0; i < t.shape().numel(); i++) { - max = std::max(data[i], max); - } - return max; -} -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/KD/tensor_util.hpp b/mobile/src/fpga/KD/tensor_util.hpp deleted file mode 100644 index 81d86f22f7..0000000000 --- a/mobile/src/fpga/KD/tensor_util.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "tensor.hpp" - -namespace paddle_mobile { -namespace zynqmp { -float find_max(const Tensor& tensor); -} // namespace zynqmp -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/api.cpp b/mobile/src/fpga/V1/api.cpp deleted file mode 100644 index dc5163d2b2..0000000000 --- a/mobile/src/fpga/V1/api.cpp +++ /dev/null @@ -1,1021 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/V1/api.h" -#include -#include "fpga/V1/bias_scale.h" -#include "fpga/V1/deconv_filter.h" -#include "fpga/V1/filter.h" -#include "fpga/V1/image.h" - -namespace paddle_mobile { -namespace fpga { - -#define USE_RELU 1 -#define USE_BIAS 2 - -void format_image(framework::Tensor *image_tensor) { - auto dims = image_tensor->dims(); - auto channel = dims[1], height = dims[2], width = dims[3]; - kTypeId_t input_type = image_tensor->type(); - if (input_type == type_id()) { - auto data_ptr = image_tensor->data(); - auto external_ptr = reinterpret_cast(image_tensor->external_data); - float *p_data = external_ptr == nullptr ? data_ptr : external_ptr; - - image::format_image(&p_data, channel, height, width); - if (p_data != data_ptr && external_ptr == nullptr) { - image_tensor->reset_data_ptr(p_data); - } - } else { - auto data_ptr = image_tensor->data(); - auto external_ptr = reinterpret_cast(image_tensor->external_data); - int8_t *p_data = external_ptr == nullptr ? 
data_ptr : external_ptr; - - image::format_image(&p_data, channel, height, width); - if (p_data != data_ptr && external_ptr == nullptr) { - image_tensor->reset_data_ptr(p_data); - } - } -} - -void format_ofm(framework::Tensor *ofm_tensor) { - if (ofm_tensor->type() == type_id()) { - format_fp32_ofm(ofm_tensor); - } else { - format_fp16_ofm(ofm_tensor); - } -} -void format_fp16_ofm(framework::Tensor *ofm_tensor) { - auto dims = ofm_tensor->dims(); - size_t memory_size = 0; - if (dims.size() == 4) { - auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0]; - memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * - sizeof(half); - } else if (dims.size() == 2) { - memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); - } else { - DLOG << "Wrong ofm dimension"; - } - auto p = fpga_malloc(memory_size); - // memset(p, 0, memory_size); - ofm_tensor->reset_data_ptr(p); - ofm_tensor->set_type(type_id().hash_code()); - ofm_tensor->fpga_data_num = memory_size / sizeof(half); - fpga::fpga_flush(p, memory_size); -} - -void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { - // auto dims = ofm_tensor->dims(); - size_t memory_size = 0; - if (dims.size() == 4) { - auto channel = dims[1], height = dims[2], width = dims[3]; - memory_size = - height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); - } else if (dims.size() == 2) { - memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); - } else { - DLOG << "Wrong ofm dimension"; - } - auto p = fpga_malloc(memory_size); - // memset(p, 0, memory_size); - ofm_tensor->reset_data_ptr(p); - ofm_tensor->set_type(type_id().hash_code()); - ofm_tensor->fpga_data_num = memory_size / sizeof(half); - fpga::fpga_flush(p, memory_size); -} - -void format_fp32_ofm(framework::Tensor *ofm_tensor) { - auto dims = ofm_tensor->dims(); - size_t memory_size = 0; - if (dims.size() == 4) { - auto channel = dims[1], height = dims[2], width = dims[3]; - memory_size 
= - height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float); - } else if (dims.size() == 2) { - memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float); - } else { - DLOG << "Wrong ofm dimension"; - } - auto p = fpga_malloc(memory_size); - // memset(p, 0, memory_size); - ofm_tensor->reset_data_ptr(p); - ofm_tensor->set_type(type_id().hash_code()); - ofm_tensor->fpga_data_num = memory_size / sizeof(float); - fpga::fpga_flush(p, memory_size); -} - -float filter_find_max(framework::Tensor *filter_tensor) { - auto filter_ptr = filter_tensor->data(); - return filter::find_max(filter_ptr, filter_tensor->numel()); -} - -int get_plit_num(framework::Tensor *filter_tensor) { - auto dims = filter_tensor->dims(); - auto chw = dims[1] * dims[2] * dims[3]; - auto num = dims[0]; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); -} -int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) { - auto dims = filter_tensor->dims(); - auto chw = dims[1] * dims[2] / stride * dims[3] / stride; - auto num = dims[0] * stride; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); -} - -int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { - auto dims = filter_tensor->dims(); - auto chw = dims[1] * dims[2] * dims[3]; - auto num = dims[0]; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_num_per_div(num, group_num, div_capacity); -} - -int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor, - int group_num, int stride) { - auto dims = filter_tensor->dims(); - auto chw = dims[1] * dims[2] / stride * dims[3] / stride; - auto num = dims[0] * stride; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_num_per_div(num, group_num, div_capacity); -} - -int get_aligned_filter_element_num(int chw) { - return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); -} - 
-void format_filter(framework::Tensor *filter_tensor, float max_value, - int group_num) { - filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT - filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT - auto dims = filter_tensor->dims(); - auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - filter::format_filter(&new_data, num, channel, height, width, group_num, - max_value); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} -void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { - auto dims = filter_tensor->dims(); - auto num = dims[0], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} - -void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, - int stride) { - auto dims = filter_tensor->dims(); - auto num = dims[0], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - - int hw = height * width; - deconv_filter::deconv_NC_convert(&new_data, num, 1, hw); - - num = dims[1]; - int channel = dims[0]; - - deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width, - scale_ptr, stride); - - // framework::DDim dims_new = - // framework::make_ddim({num, 1, 
height, width}); - // filter_tensor->Resize(dims_new); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} - -void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { - filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT - filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT - auto dims = filter_tensor->dims(); - auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - filter::format_fc_filter(&new_data, num, channel, height, width, 1, - max_value); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} -void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, - int group_num, int stride) { - filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT - filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT - auto dims = filter_tensor->dims(); - auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, data_ptr, memory_size); - - int hw = height * width; - deconv_filter::deconv_NC_convert(&new_data, num, channel, hw); - - num = dims[1]; - channel = dims[0]; - deconv_filter::deconv_format_filter( - &new_data, (int)num, (int)channel, // NOLINT - (int)height, // NOLINT - (int)width, group_num, max_value, stride); // NOLINT - - framework::DDim dims_new = - framework::make_ddim({num, channel, height, width}); - filter_tensor->Resize(dims_new); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} - -void format_bias_scale_array(float 
**bias_scale_array, - int element_num_per_division, int num) { - bias_scale::format_bias_scale_array(bias_scale_array, - element_num_per_division, num); -} -void format_bias_array(float **bias_array, int num) { - bias_scale::format_bias_array(bias_array, num); -} - -void format_concat_output(framework::Tensor *out, int height, int width, - int image_num, uint32_t *channel_num) { - int sum_channel = 0, sum_cw = 0; - for (int i = 0; i < image_num; i++) { - sum_channel += channel_num[i]; - } - - sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT); - auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half)); - auto ddim = framework::make_ddim({1, sum_channel, height, width}); - out->Resize(ddim); - out->reset_data_ptr(data_ptr); - out->fpga_data_num = sum_cw * height; - out->set_type(type_id().hash_code()); -} -void format_conv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float **bs_ptr, - int group) { - float max_value = fpga::filter_find_max(filter_tensor); - fpga::format_filter(filter_tensor, max_value, group); - int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group); - fpga::format_bias_scale_array(bs_ptr, element_num_per_div, - ofm_tensor->dims()[1]); - fpga::format_fp16_ofm(ofm_tensor); -} -void format_deconv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float **bs_ptr, - int group, int sub_conv_n) { - int channel = ofm_tensor->dims()[1]; - float max_value = filter_find_max(filter_tensor); - format_deconv_filter(filter_tensor, max_value, group, sub_conv_n); - int element_num_per_div = - get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n); - format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n); - format_fp16_ofm(ofm_tensor); -} - -void format_dwconv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float *scale_ptr, - float **bias_ptr) { - auto channel = ofm_tensor->dims()[1]; - format_dwconv_filter(filter_tensor, scale_ptr); - 
format_bias_array(bias_ptr, channel); - format_fp16_ofm(ofm_tensor); -} -void format_DWDeconv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float **bs_ptr, - int group, int sub_conv_n) { - int channel = ofm_tensor->dims()[1]; - // dw-deconv - format_DWDconv_filter( - filter_tensor, - (reinterpret_cast(*bs_ptr) + sub_conv_n * channel), sub_conv_n); - format_bias_array(bs_ptr, channel); - format_fp16_ofm(ofm_tensor); -} -void expand_conv_arg(ConvArgs *arg) { - ConvArgs args = *arg; - - auto fpga_bias_scale_len = - align_to_x(args.filter_num / args.group_num, 8) * args.group_num; - - auto output_height = - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1; - auto output_width = - (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1; - - auto filter_per_group = args.filter_num / args.group_num; - auto channel_per_group = args.image.channels / args.group_num; - - auto image_row_count = args.image.width * args.image.channels; - auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); - auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) + - args.image.pad_width * args.image.channels; - auto filter_amount_all = - align_to_x(args.kernel.height * args.kernel.width * channel_per_group, - FILTER_ELEMENT_ALIGNMENT); - - auto output_amount_per_row = align_to_x( - (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num, - IMAGE_ALIGNMENT); - - // find the opt partition strategy - uint64_t res_win; - uint64_t res_fit = 0; - for (res_win = 1; res_win <= output_width; res_win++) { - if ((align_to_x( - (args.image.channels * - (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), - IMAGE_ALIGNMENT) / - 16 + - 1) * - args.kernel.height > - 2048) { - break; - } - } - - if (res_win != output_width) { - res_win -= 1; - } - - if (((res_win % 2) != 0) && (res_win != 1)) { - res_win = res_win - 1; - } - res_fit 
= res_win; - - auto block_num = (output_width + res_fit - 1) / res_fit; - auto block_len = res_fit; - auto block_last = output_width - res_fit * (block_num - 1); - - auto res_amount_per_row = - (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num; - auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; - - auto image_block_amount_per_row = - args.kernel.stride_w * res_fit * args.image.channels; - auto filter_pad_width_mul_channel = - args.image.pad_width * args.image.channels; - auto image_amount_per_row_multi_win_first = - image_amount_per_row * - (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height); - auto image_amount_per_row_multi_win = - image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h); - - auto image_block_num = block_num; - auto image_block_len = - align_to_x((args.image.channels * - (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), - IMAGE_ALIGNMENT) / - 16 + - 1; - auto image_block_len_last = - align_to_x( - (args.image.channels * - (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), - IMAGE_ALIGNMENT) / - 16 + - 1; - auto image_win_cnt = block_len; - auto image_win_cnt_last = block_last; - auto res_row_data_align4_pad = res_amount_per_row_pad / 8; - auto prog_full_cnt = 1024 / (filter_amount_all / 16 * 2) - 1; - if (prog_full_cnt == 511) { - prog_full_cnt--; - } - auto post_prog_full_cnt = - (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) - ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) - : 0; - // auto cmd = 0UL | (args.relu_enabled ? 
USE_RELU : 0) | USE_BIAS; - auto cmd = 0UL | USE_BIAS; - - auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | - ((args.deconv_tx_param.sub_conv_num) << 8) | - ((args.deconv_tx_param.omit_size) << 0); - (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); - (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); - (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address); - (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) + - args.deconv_tx_param.out_addr_offset; - (*arg).driver.output_height = output_height; - (*arg).driver.output_width = output_width; - (*arg).driver.filter_per_group = filter_per_group; - (*arg).driver.channel_per_group = channel_per_group; - (*arg).driver.image_amount_per_row = image_amount_per_row; - (*arg).driver.image_one_pad_per_row = image_one_pad_per_row; - (*arg).driver.filter_amount_all = filter_amount_all; - (*arg).driver.output_amount_per_row = output_amount_per_row; - (*arg).driver.image_block_amount_per_row = image_block_amount_per_row; - (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel; - (*arg).driver.image_amount_per_row_multi_win_first = - image_amount_per_row_multi_win_first; - (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win; - (*arg).driver.image_block_num = image_block_num; - (*arg).driver.image_block_len = image_block_len; - (*arg).driver.image_block_len_last = image_block_len_last; - (*arg).driver.image_win_cnt = image_win_cnt; - (*arg).driver.image_win_cnt_last = image_win_cnt_last; - (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad; - (*arg).driver.prog_full_cnt = prog_full_cnt; - (*arg).driver.post_prog_full_cnt = post_prog_full_cnt; - (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len; - (*arg).driver.cmd = cmd; - (*arg).driver.deconv_param = deconv_param; -} // expand_conv_arg() - -void expand_EW_arg(EWAddArgs *arg) { - EWAddArgs args = *arg; - // uint64_t cmd = 
args.relu_enabled ? USE_RELU : 0; - uint64_t cmd = 0; - uint64_t datalen = (uint64_t)args.image0.width * - (uint64_t)args.image0.height * - (uint64_t)args.image0.channels; - uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; - uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address); - uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address); - uint64_t output_address_phy = vaddr_to_paddr(args.output.address); - - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, - IMAGE_ALIGNMENT); - uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | - ((uint64_t)args.image0.width << 16) | - (uint64_t)args.image0.height; - - (*arg).driver.image0_address_phy = image0_address_phy; - (*arg).driver.image1_address_phy = image1_address_phy; - (*arg).driver.datalen = datalen; - (*arg).driver.image_image_pixel = image_image_pixel; - (*arg).driver.image_amount_per_row = image_amount_per_row; - (*arg).driver.output_address_phy = output_address_phy; - (*arg).driver.coefficient = coefficient; - (*arg).driver.cmd = cmd; -} // expand_EW_arg - -void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, - float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto out_ptr = out->data(); - auto deleter = [](void *p) { fpga_free(p); }; - - arg->group_num = (uint32_t)group_num; - // Either group_num or split_num = 1; - arg->split_num = group_num == 1 ? 
(uint32_t)get_plit_num(filter) : 1; - arg->filter_num = (uint32_t)filter->dims()[0]; - arg->output.address = out_ptr; - arg->output.scale_address = out->scale; - arg->conv_arg = - (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT - - arg->shared_conv_arg = std::shared_ptr(arg->conv_arg, deleter); - - memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs)); - - arg->concat_arg.image_num = arg->split_num; - arg->concat_arg.image_out = out_ptr; - arg->concat_arg.scale_out = out->scale; - arg->concat_arg.height = (uint32_t)out->dims()[2]; - arg->concat_arg.width = (uint32_t)out->dims()[3]; - - int n = arg->split_num; - arg->concat_arg.images_in = - static_cast(fpga_malloc(n * sizeof(int *))); - arg->concat_arg.scales_in = - static_cast(fpga_malloc(n * sizeof(float *))); - arg->concat_arg.channel_num = - static_cast(fpga_malloc(n * sizeof(uint32_t))); - arg->vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(arg->concat_arg.images_in), deleter)); - arg->vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(arg->concat_arg.scales_in), deleter)); - arg->vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(arg->concat_arg.channel_num), deleter)); - - auto channel = (int)out->dims()[1]; // NOLINT - int filter_num_per_div = get_filter_num_per_div(filter, group_num); - int element_num = get_aligned_filter_element_num( - (int)(filter->dims()[1] * filter->dims()[2] * // NOLINT - filter->dims()[3])); - - for (int i = 0; i < n; i++) { - // arg->conv_arg[i].relu_enabled = relu_enabled; - arg->conv_arg[i].output.activation.activation_type = activation_enable; - arg->conv_arg[i].output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - arg->conv_arg[i].group_num = (uint32_t)group_num; - arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; - arg->conv_arg[i].kernel.width = 
(uint32_t)filter->dims()[3]; - arg->conv_arg[i].image.address = input_ptr; - arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; - arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_arg[i].image.scale_address = input->scale; - arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; - arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; - arg->conv_arg[i].filter_scale_address = filter->scale; - arg->conv_arg[i].filter_num = (uint32_t)( - i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT - : filter_num_per_div); - - size_t filter_size = - element_num * - align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) * - sizeof(int8_t); - auto filter_head = &( - (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_arg[i].filter_address = fpga_malloc(filter_size); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].filter_address), deleter)); - memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); - fpga_flush(arg->conv_arg[i].filter_address, filter_size); - // for test - // { - // static int cnt = 0; - // if(cnt == 4){ - // int8_t result = 0; - // std::string str = "fc_filter"; - // fpga::savefile(str, arg->conv_arg[i].filter_address, - // filter_size, result); - // - // } - // cnt++; - //} - - size_t bs_size = 2 * - align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) * - sizeof(float); - auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_arg[i].sb_address = fpga_malloc(bs_size); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].sb_address), deleter)); - memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); - fpga_flush(arg->conv_arg[i].sb_address, bs_size); - // for test - /*{ - static int cnt = 0; - if(cnt == 4){ - float result = 0; - std::string str = "fc_bs"; - fpga::savefile(str, arg->conv_arg[i].sb_address, bs_size/4, 
-result); - - } - cnt++; -}*/ - - if (n > 1) { - arg->conv_arg[i].output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->conv_arg[i].output.address = - fpga_malloc(out->dims()[2] * - align_to_x((int)(out->dims()[3] * // NOLINT - arg->conv_arg[i].filter_num), - IMAGE_ALIGNMENT) * - sizeof(half)); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].output.scale_address), - deleter)); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].output.address), deleter)); - } else { - arg->conv_arg[i].output.scale_address = out->scale; - arg->conv_arg[i].output.address = out_ptr; - } - - arg->concat_arg.images_in[i] = - (half *)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; - arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; - - expand_conv_arg(&arg->conv_arg[i]); - } - filter->reset_data_ptr(nullptr); - fpga_free(bs_ptr); -} // fill_split_arg - -void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, - float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto deleter = [](void *p) { fpga_free(p); }; - - arg->group_num = (uint32_t)group_num; - arg->sub_conv_num = (uint32_t)stride_h; - arg->filter_num = (uint32_t)filter->dims()[0]; - uint32_t sub_conv_num = arg->sub_conv_num; - int sub_pad = - deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], // NOLINT - padding_w, stride_w); - auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis( - (int)filter->dims()[3], stride_w); // NOLINT - - auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[3], sub_pad, sub_filter_width); // NOLINT - auto 
sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[2], sub_pad, sub_filter_width); // NOLINT - - arg->sub_output_width = (uint32_t)sub_output_width; - arg->sub_output_height = (uint32_t)sub_output_height; - arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( - stride_w, (int)filter->dims()[3], padding_w); // NOLINT - - auto sub_channels = (int)input->dims()[1]; // NOLINT - uint32_t omit_size = arg->omit_size; - int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; - int sub_filter_num = sub_conv_num * (arg->filter_num); - - framework::DDim dims_out_new = framework::make_ddim( - {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width}); - fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); - arg->output.address = - (half *)out_ptr + // NOLINT - omit_size * sizeof(half) * - (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); - arg->output.scale_address = out->scale; - - uint32_t conv_output_size = - (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * - sub_output_height; - uint32_t split_num = - group_num == 1 ? 
(uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1; - - for (int i = 0; i < sub_conv_num; ++i) { - arg->split_conv_args.push_back(std::make_shared()); - arg->split_conv_args[i]->filter_num = - (arg->sub_conv_num) * (arg->filter_num); - arg->split_conv_args[i]->group_num = (uint32_t)group_num; - arg->split_conv_args[i]->split_num = split_num; - arg->split_conv_args[i]->concat_arg.height = sub_output_height; - arg->split_conv_args[i]->concat_arg.width = sub_output_width; - arg->split_conv_args[i]->concat_arg.image_num = split_num; - - arg->split_conv_args[i]->conv_arg = - static_cast(fpga_malloc(split_num * sizeof(ConvArgs))); - arg->split_conv_args[i]->concat_arg.images_in = - static_cast(fpga_malloc(split_num * sizeof(int16_t *))); - arg->split_conv_args[i]->concat_arg.scales_in = - static_cast(fpga_malloc(split_num * sizeof(float *))); - arg->split_conv_args[i]->concat_arg.channel_num = - static_cast(fpga_malloc(split_num * sizeof(uint32_t))); - arg->split_conv_args[i]->shared_conv_arg = - std::shared_ptr(arg->split_conv_args[i]->conv_arg, deleter); - arg->split_conv_args[i]->vector_concat_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->concat_arg.images_in), - deleter)); - arg->split_conv_args[i]->vector_concat_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->concat_arg.scales_in), - deleter)); - arg->split_conv_args[i]->vector_concat_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->concat_arg.channel_num), - deleter)); - } - - auto filter_num_per_div = - (uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w); - int element_num = get_aligned_filter_element_num( - (int)(sub_channels * sub_filter_width * sub_filter_width)); // NOLINT - - int chw = sub_channels * sub_filter_width * sub_filter_width; - int division_capacity = filter::calc_division_capacity(chw); - int num_per_div_before_alignment = - filter::calc_num_per_div(sub_filter_num, 
group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = (sub_filter_num + num_per_div_before_alignment - 1) / - num_per_div_before_alignment; - int residual = sub_filter_num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - - int filter_sub_conv_offset = element_num * num_after_alignment; - uint32_t out_addr_offset = 0; - for (int i = 0; i < sub_conv_num; ++i) { - if (sub_conv_num == 1) { - arg->split_conv_args[i]->output.address = arg->output.address; - arg->split_conv_args[i]->output.scale_address = arg->output.scale_address; - out_addr_offset = 0; - - } else { - out_addr_offset = - sizeof(int16_t) * (sub_conv_num - 1 - i) * - (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); - - arg->split_conv_args[i]->output.address = out_ptr; - arg->split_conv_args[i]->output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->output.scale_address), - deleter)); - } - - for (int j = 0; j < split_num; ++j) { - arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = - activation_enable; - arg->split_conv_args[i] - ->conv_arg[j] - .output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num; - - arg->split_conv_args[i]->conv_arg[j].kernel.width = - (uint32_t)sub_filter_width; - arg->split_conv_args[i]->conv_arg[j].kernel.height = - (uint32_t)sub_filter_width; - arg->split_conv_args[i]->conv_arg[j].kernel.stride_w = 1; - arg->split_conv_args[i]->conv_arg[j].kernel.stride_h = 1; - - arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.deconv_en = 1; - 
arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.sub_conv_num = - sub_conv_num; - arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.omit_size = - omit_size; - arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.out_addr_offset = - out_addr_offset; - - arg->split_conv_args[i]->conv_arg[j].image.scale_address = input->scale; - arg->split_conv_args[i]->conv_arg[j].image.channels = - (uint32_t)sub_channels; - arg->split_conv_args[i]->conv_arg[j].image.width = - (uint32_t)input->dims()[3]; - arg->split_conv_args[i]->conv_arg[j].image.height = - (uint32_t)input->dims()[2]; - arg->split_conv_args[i]->conv_arg[j].image.pad_width = (uint32_t)sub_pad; - arg->split_conv_args[i]->conv_arg[j].image.pad_height = (uint32_t)sub_pad; - arg->split_conv_args[i]->conv_arg[j].image.address = input_ptr; - - arg->split_conv_args[i]->conv_arg[j].filter_scale_address = filter->scale; - arg->split_conv_args[i]->conv_arg[j].filter_num = - (uint32_t)(j == split_num - 1 - ? sub_filter_num - (split_num - 1) * filter_num_per_div - : filter_num_per_div); - - size_t filter_size = - element_num * - align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, - FILTER_NUM_ALIGNMENT) * - sizeof(int8_t); - auto filter_head = &(( - int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; - arg->split_conv_args[i]->conv_arg[j].filter_address = - fpga_malloc(filter_size); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].filter_address), - deleter)); - - memcpy(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_head, - filter_size); - fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address, - filter_size); - - size_t bs_align_num = align_to_x( - arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT); - size_t bs_size = 2 * bs_align_num * sizeof(float); - auto bs_head = &bs_ptr[j * filter_num_per_div * 2]; - - 
arg->split_conv_args[i]->conv_arg[j].sb_address = fpga_malloc(bs_size); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].sb_address), - deleter)); - - memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size); - fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size); - - if (split_num == 1) { - arg->split_conv_args[i]->conv_arg[j].output.address = - arg->split_conv_args[i]->output.address; - arg->split_conv_args[i]->conv_arg[j].output.scale_address = - arg->split_conv_args[i]->output.scale_address; - } else { - arg->split_conv_args[i]->conv_arg[j].output.address = - fpga_malloc(conv_output_size * sizeof(int16_t)); - arg->split_conv_args[i]->conv_arg[j].output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].output.address), - deleter)); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].output.scale_address), - deleter)); - } - arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( - arg->split_conv_args[i]->conv_arg[j].output.address); - arg->split_conv_args[i]->concat_arg.scales_in[j] = - arg->split_conv_args[i]->conv_arg[j].output.scale_address; - arg->split_conv_args[i]->concat_arg.channel_num[j] = - arg->split_conv_args[i]->conv_arg[j].filter_num; - - expand_conv_arg(&(arg->split_conv_args[i]->conv_arg[j])); - } - - arg->split_conv_args[i]->concat_arg.image_out = - arg->split_conv_args[i]->output.address; - arg->split_conv_args[i]->concat_arg.scale_out = - arg->split_conv_args[i]->output.scale_address; - } - filter->reset_data_ptr(nullptr); - fpga_free(bs_ptr); -} // fill_deconv_arg - -void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - 
ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float *bias_ptr) { - auto deleter = [](void *p) { fpga_free(p); }; - arg->vector_dwconv_space.push_back( - std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); - - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); - arg->sub_conv_num = 1; - // arg->relu_enabled = relu_enabled; - arg->output.activation.activation_type = activation_enable; - arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; - arg->bias_address = bias_ptr; - arg->filter_address = filter_ptr; - arg->kernel.height = (uint32_t)filter->dims()[2]; - arg->kernel.width = (uint32_t)filter->dims()[3]; - arg->kernel.stride_h = (uint32_t)stride_h; - arg->kernel.stride_w = (uint32_t)stride_w; - arg->image.address = input_ptr; - arg->image.channels = (uint32_t)input->dims()[1]; - arg->image.height = (uint32_t)input->dims()[2]; - arg->image.width = (uint32_t)input->dims()[3]; - arg->image.pad_height = (uint32_t)padding_h; - arg->image.pad_width = (uint32_t)padding_w; - arg->image.scale_address = input->scale; - arg->output.address = output_ptr; - arg->output.scale_address = out->scale; -} // end dwconv arg fill - -void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - - auto deleter = [](void *p) { fpga_free(p); }; - - arg->group_num = (uint32_t)filter->dims()[0]; - arg->sub_conv_num = (uint32_t)stride_w; - arg->filter_num = (uint32_t)filter->dims()[0]; - - int sub_conv_num = stride_w; - - int sub_pad = - deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], // NOLINT - padding_w, stride_w); - auto 
sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis( - (int)filter->dims()[3], stride_w); // NOLINT - - auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[3], sub_pad, sub_filter_width); // NOLINT - auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[2], sub_pad, sub_filter_width); // NOLINT - - arg->sub_output_width = (uint32_t)sub_output_width; - arg->sub_output_height = (uint32_t)sub_output_height; - arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( - stride_w, (int)filter->dims()[3], padding_w); // NOLINT - - auto sub_channels = (int)input->dims()[1]; // NOLINT - uint32_t omit_size = arg->omit_size; - int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; - int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; - int sub_filter_num = sub_conv_num * (arg->filter_num); - - framework::DDim dims_out_new = framework::make_ddim( - {1, arg->filter_num, real_out_height, real_out_width}); - fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); - - /*====For Addition - arg->output.address = - (half *)out_ptr + // NOLINT - omit_size * sizeof(half) * - (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); - */ - arg->output.address = out_ptr; - arg->output.scale_address = out->scale; - - int filter_offset = sub_filter_width * sub_filter_width * - align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) * - arg->sub_conv_num; - - for (int i = 0; i < sub_conv_num; ++i) { - arg->dw_conv_args.push_back(std::make_shared()); - - arg->dw_conv_args[i]->sub_conv_num = sub_conv_num; - // arg->dw_conv_args[i]->relu_enabled = relu_enabled; - arg->dw_conv_args[i]->output.activation.activation_type = activation_enable; - arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - arg->dw_conv_args[i]->bias_address = bias_ptr; - - arg->dw_conv_args[i]->filter_address = - 
fpga_malloc(filter_offset * sizeof(int16_t)); - memcpy(arg->dw_conv_args[i]->filter_address, - (reinterpret_cast(filter_ptr) + i * filter_offset), - filter_offset * sizeof(int16_t)); - arg->vector_dw_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->dw_conv_args[i]->filter_address), - deleter)); - - arg->dw_conv_args[i]->kernel.height = (uint32_t)sub_filter_width; - arg->dw_conv_args[i]->kernel.width = (uint32_t)sub_filter_width; - - arg->dw_conv_args[i]->kernel.stride_h = (uint32_t)1; - arg->dw_conv_args[i]->kernel.stride_w = (uint32_t)1; - arg->dw_conv_args[i]->image.address = input_ptr; - arg->dw_conv_args[i]->image.channels = (uint32_t)input->dims()[1]; - arg->dw_conv_args[i]->image.height = (uint32_t)input->dims()[2]; - arg->dw_conv_args[i]->image.width = (uint32_t)input->dims()[3]; - - arg->dw_conv_args[i]->image.pad_height = sub_pad; - arg->dw_conv_args[i]->image.pad_width = sub_pad; - arg->dw_conv_args[i]->image.scale_address = input->scale; - - arg->dw_conv_args[i]->output.address = - fpga_malloc(sub_output_height * - align_to_x(sub_output_width * sub_channels * sub_conv_num, - IMAGE_ALIGNMENT) * - sizeof(int16_t)); - arg->dw_conv_args[i]->output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->vector_dw_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->dw_conv_args[i]->output.address), - deleter)); - arg->vector_dw_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->dw_conv_args[i]->output.scale_address), - deleter)); - } - - // arg->output.scale_address = out->scale; -} // end dwconv arg fill - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/api.h b/mobile/src/fpga/V1/api.h deleted file mode 100644 index 33a5d3d33f..0000000000 --- a/mobile/src/fpga/V1/api.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "fpga/common/fpga_common.h" -#include "fpga/common/pe.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace fpga { - -void format_image(framework::Tensor* image_tensor); -void format_ofm(framework::Tensor* ofm_tensor); -void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory -void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims); -void format_fp32_ofm(framework::Tensor* ofm_tensor); - -float filter_find_max(framework::Tensor* filter_tensor); -int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); -int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor, - int group_num, int stride); - -int get_plit_num(framework::Tensor* filter_tensor); -int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride); - -int get_aligned_filter_element_num(int chw); -void format_filter(framework::Tensor* filter_tensor, float max_value, - int group_num); -void format_fc_filter(framework::Tensor* filter_tensor, float max_value); -void format_bias_scale_array(float** bias_scale_array, - int element_num_per_division, int num); -void format_bias_array(float** bias_array, int num); -void format_concat_output(framework::Tensor* out, int height, int width, - int image_num, uint32_t* channel_num); - -void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - 
ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, - float* bs_ptr); -void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int group_num, - int stride_h, int stride_w, int padding_h, int padding_w, - float* bs_ptr); -void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float* bias_ptr); -void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - ActivationType activation_enable, - int16_t leaky_relu_negative_slope, int stride_h, - int stride_w, int padding_h, int padding_w, - float* bs_ptr); - -void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, - int group_num, int stride); -void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr); -void format_conv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float** bs_ptr, int group); -void format_deconv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float** bs_ptr, - int group, int sub_conv_n); -void format_dwconv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float* scale_ptr, - float** bias_ptr); -void format_DWDeconv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float** bs_ptr, - int group, int sub_conv_n); - -template -void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) { - float data; - std::ofstream out(filename.c_str()); - for (int i = 0; i < dataSize; ++i) { - data = (((Dtype*)buffer)[i]); // NOLINT - out << data << std::endl; - } - 
out.close(); - return; -} - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/bias_scale.cpp b/mobile/src/fpga/V1/bias_scale.cpp deleted file mode 100644 index ffb5303c85..0000000000 --- a/mobile/src/fpga/V1/bias_scale.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/V1/bias_scale.h" -#include -#include "fpga/common/fpga_common.h" - -namespace paddle_mobile { -namespace fpga { -namespace bias_scale { - -void align_element(float **data_in, int num_per_div_before_alignment, int num) { - int copynum = 0; - float *ptr_unaligned = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); - int num_element = - 2 * div_num * num_per_div_after_alignment; // including bias & scale - float *ptr_aligned = - (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT - - memset(ptr_aligned, 0, num_element * sizeof(float)); - - for (int i = 0; i < div_num; i++) { - if (i == div_num - 1) { - copynum = (num_per_div_after_alignment * div_num > num) - ? 
(num % num_per_div_after_alignment) - : (num_per_div_before_alignment); - } else { - copynum = num_per_div_before_alignment; - } - - memcpy(ptr_aligned + i * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i, - copynum * sizeof(float)); - memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i + num, - copynum * sizeof(float)); - } - - fpga_free(ptr_unaligned); - *data_in = ptr_aligned; -} - -void interleave(float **data_in, int num_after_alignment) { - // num_after_alignment: number of bias after alignment - - float *ptr_uninterleaved = *data_in; - float *ptr_interleaved = - (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT - int num = num_after_alignment / 4; - for (int i = 0; i < num; i++) { - memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, - 4 * sizeof(float)); - memcpy(ptr_interleaved + 8 * i + 4, - ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float)); - } - - fpga_free(ptr_uninterleaved); - *data_in = ptr_interleaved; -} - -void format_bias_scale_array(float **bias_scale_array, - int element_num_per_division, int num) { - align_element(bias_scale_array, element_num_per_division, num); - int div_num = (num + element_num_per_division - 1) / element_num_per_division; - int element_num_after_division = - align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); - interleave(bias_scale_array, div_num * element_num_after_division); - fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); -} -void format_bias_array(float **bias_array, int num) { - float *ptr_unaligned = *bias_array; - int num_before_align = num; - int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT); - int16_t *ptr_aligned = - (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT - - memset(ptr_aligned, 0, num_after_align * sizeof(int16_t)); - for (int i = 0; i < num_before_align; i++) { - ptr_aligned[i] = 
fp32_2_fp16(ptr_unaligned[i]); - } - *bias_array = (float *)ptr_aligned; // NOLINT - fpga_free(ptr_unaligned); -} - -} // namespace bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/bias_scale.h b/mobile/src/fpga/V1/bias_scale.h deleted file mode 100755 index 9ebdc71bce..0000000000 --- a/mobile/src/fpga/V1/bias_scale.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle_mobile { -namespace fpga { -namespace bias_scale { - -void align_element(float** data_in, int num_per_div_before_alignment, int num); -void interleave(float** data_in, int num_after_alignment); -void format_bias_scale_array(float** bias_scale_array, - int element_num_per_division, int num); -void format_bias_array(float** bias_array, int num); - -} // namespace bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/deconv_bias_scale.cpp b/mobile/src/fpga/V1/deconv_bias_scale.cpp deleted file mode 100644 index 0bcc91ddd2..0000000000 --- a/mobile/src/fpga/V1/deconv_bias_scale.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/V1/deconv_bias_scale.h" -// #include "deconv_bias_scale.h" -#include "fpga/V1/bias_scale.h" -// #include "bias_scale.h" -#include - -#include "fpga/V1/api.h" -// #include "fpga_api.h" -namespace paddle_mobile { -namespace fpga { -namespace deconv_bias_scale { - -void deconv_bias_scale_expand(float** bias_scale_array, int num, - int sub_conv_n) { - int sub_num = num * sub_conv_n; - float* ptr_tmp = *bias_scale_array; - float* ptr_bias_scale_expand = - (float*)fpga_malloc(sizeof(float) * sub_num * 2); - int scale_base_offset = sub_num; - for (int i = 0; i < sub_conv_n; ++i) { - int offset = num * i; - // copy bias - fpga_copy(ptr_bias_scale_expand + offset, ptr_tmp, num * sizeof(float)); - // copy scale - fpga_copy(ptr_bias_scale_expand + scale_base_offset + offset, ptr_tmp + num, - num * sizeof(float)); - } - *bias_scale_array = ptr_bias_scale_expand; - fpga_free(ptr_tmp); -} - -} // namespace deconv_bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/deconv_bias_scale.h b/mobile/src/fpga/V1/deconv_bias_scale.h deleted file mode 100644 index 820c6984d4..0000000000 --- a/mobile/src/fpga/V1/deconv_bias_scale.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle_mobile { -namespace fpga { -namespace deconv_bias_scale { - -void deconv_bias_scale_expand(float** bias_scale_array, int num, - int sub_conv_n); - -} // namespace deconv_bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/deconv_filter.cpp b/mobile/src/fpga/V1/deconv_filter.cpp deleted file mode 100644 index 36a02578bc..0000000000 --- a/mobile/src/fpga/V1/deconv_filter.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "fpga/V1/deconv_filter.h" -#include -#include -// #include "deconv_filter.h" -#include "fpga/V1/filter.h" -// #include "filter.h" -#include "fpga/V1/api.h" - -namespace paddle_mobile { -namespace fpga { -namespace deconv_filter { - -/* -inverse kernel weights of each channel for every filter -*/ -void deconv_inverse_filter(float** data_in, int num, int channel, int width, - int height) { - float* tmp = *data_in; - int data_size = num * channel * width * height; - int hw_len = height * width; - auto tmp_data = - reinterpret_cast(fpga_malloc(data_size * sizeof(float))); - for (int i = 0; i < num; ++i) { - for (int j = 0; j < channel; ++j) { - for (int k = 0; k < hw_len; ++k) { - tmp_data[i * channel * hw_len + j * hw_len + k] = - (*data_in)[i * channel * hw_len + j * hw_len + hw_len - k - 1]; - } - } - } - *data_in = tmp_data; - fpga_free(tmp); -} - -/* - calculate sub padding number -*/ -int deconv_calc_sub_pad(int filter_axis, int pad, int stride) { - if (stride == 0 || ((filter_axis - pad - 1) < 0)) { - PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters"); - } - return (filter_axis - pad - 1) / stride; -} -int deconv_get_sub_filter_axis(int filter_axis, int stride) { - return (filter_axis / stride); -} - -int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) { - return ((image_axis + 2 * sub_pad - sub_filter_axis) + 1); -} - -/* - (filter_width-pad,filter_width-pad) is the first pixel of sub-pixel image - position. 
so the omit rows or columns is (stride - ) -*/ -int deconv_get_omit(int stride, int filter_width, int pad) { - PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters"); - int idx; - bool flag = false; - for (idx = 1; idx <= stride; ++idx) { - int j = idx; - for (; j <= filter_width;) { - if (j == filter_width - pad) { - flag = true; - break; - } - j = j + stride; - } - if (flag) { - break; - } - } - - return (stride - idx); -} - -template -void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n, - int kernel_num, int channel) { - T* ptr_tmp = *data_in; - int sub_num = kernel_num * sub_conv_n; - int sub_h = height / sub_conv_n; - int sub_w = width / sub_conv_n; - - int sub_filter_size = - kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n; - - T* ptr_sub_filter = - reinterpret_cast(fpga_malloc(sub_filter_size * sizeof(T))); - for (int idx = 0; idx < sub_conv_n; ++idx) { - for (int nn = 0; nn < sub_num; ++nn) { - int ni = nn % kernel_num; - - int woff = sub_conv_n - 1 - (nn / kernel_num); // - - for (int hh = 0; hh < sub_h; ++hh) { - int hi = hh * sub_conv_n + idx % sub_conv_n; - for (int ww = 0; ww < sub_w; ++ww) { - int wi = ww * sub_conv_n + woff; // 1 0 - - int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel; // - int kidx = ((ni * height + hi) * width + wi) * channel; // - - fpga_copy( - ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx, - (*data_in) + kidx, channel * sizeof(T)); - // for (int cc =0; cc < channel; ++cc) { - // ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] = - // (*data_in)[kidx + cc]; - // } - } - } - } - } - *data_in = ptr_sub_filter; - fpga_free(ptr_tmp); -} - -void deconv_NC_convert(float** filter_in, int kernel_num, int channels, - int hw) { - float* tmp = *filter_in; - float* ptr_filter = reinterpret_cast(paddle_mobile::fpga::fpga_malloc( - hw * kernel_num * channels * sizeof(float))); - - for (int c = 0; c < channels; ++c) { - for (int n = 0; n < kernel_num; ++n) { 
- paddle_mobile::fpga::fpga_copy(ptr_filter + n * hw + kernel_num * hw * c, - tmp + n * channels * hw + c * hw, - hw * sizeof(float)); - } - } - *filter_in = ptr_filter; - paddle_mobile::fpga::fpga_free(tmp); -} - -void deconv_format_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max, int stride) { - int data_size = channel * height * width * num; - - /*{ - float result2 = (float)0; - string filename = "origin_filter_data"; - api::savefile(filename, (void *)*data_in, data_size, result2); - }*/ - - deconv_inverse_filter(data_in, num, channel, width, height); - - /* { - float result2 = (float)0; - string filename = "inverse_filter_data"; - api::savefile(filename, (void *)*data_in, data_size, result2); - }*/ - - filter::quantize(data_in, data_size, max); - /* { - char result2 = (char)0; - string filename = "quantize_filter_data"; - api::savefile(filename, (void *)*data_in, data_size, result2); - }*/ - char** quantize_data = (char**)data_in; // NOLINT - - filter::convert_to_hwc(quantize_data, num, channel, height, width); - /*{ - char result2 = (char)0; - string filename = "convert_to_hwc_filter_data"; - api::savefile(filename, (void *)*quantize_data, data_size, - result2); - }*/ - - deconv_get_sub_filter(quantize_data, height, width, stride, num, - channel); - /*{ - char result2 = (char)0; - string filename = "sub_filter_filter_data"; - api::savefile(filename, (void *)*quantize_data, data_size, result2); -}*/ - - int sub_conv_n = stride; - int sub_h = height / sub_conv_n; - int sub_w = width / sub_conv_n; - int sub_chw = sub_h * sub_w * channel; - int sub_num = sub_conv_n * num; - int division_capacity = filter::calc_division_capacity(sub_chw); - int num_per_div_before_alignment = - filter::calc_num_per_div(sub_num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = (sub_num + num_per_div_before_alignment - 1) / - 
num_per_div_before_alignment; - int residual = (sub_num) % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - - char** ptr_ptr_data = - reinterpret_cast(fpga_malloc(sub_conv_n * sizeof(char*))); - int origin_offset = sub_chw * sub_num; - for (int i = 0; i < sub_conv_n; ++i) { - (ptr_ptr_data)[i] = - reinterpret_cast(fpga_malloc(origin_offset * sizeof(char))); - fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i, - origin_offset * sizeof(char)); - - /* char result2 = (char)0; - string filename = "ptr_ptr_data" + to_string(i); - api::savefile(filename, (void *)(ptr_ptr_data[i]), origin_offset, - result2); - */ - } - // char result2 = (char)0; - // string filename = "interleave"; - // api::savefile(filename, (void *)*ptr_ptr_data, origin_offset, - // result2); - fpga_free(*quantize_data); - - int align_offset = - align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment; - char* ptr_space = reinterpret_cast(fpga_malloc( - sub_conv_n * align_offset * sizeof(char))); // continuous space - for (int i = 0; i < sub_conv_n; ++i) { - char* ptr_tmp = (ptr_ptr_data)[i]; - - filter::align_element(&ptr_tmp, sub_num, sub_chw); - filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw); - - filter::reorder(&ptr_tmp, num_after_alignment, sub_chw); - filter::interleave(&ptr_tmp, num_after_alignment, sub_chw); - - /* char result2 = (char)0; - string filename = "interleave" + to_string(i); - api::savefile(filename, (void *)ptr_tmp, align_offset, result2); -*/ - fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset); - fpga_free(ptr_tmp); - } - fpga_free(ptr_ptr_data); - *data_in = reinterpret_cast(ptr_space); - - /* { - char result2 = (char)0; - string filename = "ptr_space"; - api::savefile(filename, (void *)ptr_space, sub_conv_n * - align_offset, result2); - }*/ - fpga_flush(ptr_space, sub_conv_n * 
align_offset * sizeof(char)); -} - -void DWDconv_format_filter(float** data_in, int num, int channel, int height, - int width, float* scale_ptr, int stride) { - deconv_inverse_filter(data_in, num, channel, width, height); - - filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr); - int16_t** quantize_data = (int16_t**)data_in; // NOLINT - filter::convert_to_hwn(quantize_data, channel, height, width); - - deconv_get_sub_filter(quantize_data, height, width, stride, num, - channel); - - filter::align_element_n(quantize_data, channel, height, width); - fpga_flush(*quantize_data, align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) * - height * width * sizeof(int16_t)); -} - -} // namespace deconv_filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/deconv_filter.h b/mobile/src/fpga/V1/deconv_filter.h deleted file mode 100644 index f1a50b95c5..0000000000 --- a/mobile/src/fpga/V1/deconv_filter.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle_mobile { -namespace fpga { -namespace deconv_filter { - -void deconv_inverse_filter(float** data_in, int num, int channel, int width, - int height); -int deconv_calc_sub_pad(int filter_axis, int pad, int stride); -int deconv_get_sub_filter_axis(int filter_axis, int stride); -int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); -int deconv_get_omit(int stride, int filter_width, int pad); - -template -void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n, - int kernel_num, int channel); -void deconv_format_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max, int stride); -void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw); -void DWDconv_format_filter(float** data_in, int num, int channel, int height, - int width, float* scale_ptr, int stride); - -} // namespace deconv_filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/filter.cpp b/mobile/src/fpga/V1/filter.cpp deleted file mode 100644 index 425d1d1b5c..0000000000 --- a/mobile/src/fpga/V1/filter.cpp +++ /dev/null @@ -1,362 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "fpga/V1/filter.h" -#include -#include -#include "fpga/common/fpga_common.h" - -namespace paddle_mobile { -namespace fpga { -namespace filter { - -int calc_division_capacity(int chw) { - int n = 2048 / ((chw + 15) / 16) * 32; - return n < 2048 ? n : 2048; -} - -int calc_split_num(int num, int division_capacity) { - return (num + division_capacity - 1) / division_capacity; -} - -int calc_division_number(int num, int group_num, int division_capacity) { - // PADDLE_MOBILE_ENFORCE(num % group_num == 0, - // "Filter number should be divisible by group - // number"); - int split_num = calc_split_num(num, division_capacity); - // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - // "Split number or group number should be 1"); - return group_num * split_num; -} - -int calc_num_per_div(int num, int group_num, int division_capacity) { - // PADDLE_MOBILE_ENFORCE(num % group_num == 0, - // "Filter number should be divisible by group - // number"); - int split_num = calc_split_num(num, division_capacity); - // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - // "Split number or group number should be 1"); - if (group_num == 1) { - if (num > division_capacity) { - return division_capacity; - } else { - return num; - } - } else { - return (num + group_num - 1) / group_num; - } -} - -void convert_to_hwc(char **data_in, int num, int channel, int height, - int width) { - char *tmp = *data_in; - int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * chw + offset_height + w * channel + c) = - *((*data_in)++); - } - } - } - } - - *data_in = data_tmp; - fpga_free(tmp); -} - -float find_max(float *data_in, int data_size) { - float max = 0.0; - for 
(int i = 0; i < data_size; ++i) { - float value = data_in[i]; - float abs = value > 0 ? value : -value; - max = std::max(max, abs); - } - return max; -} - -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} - -void quantize(float **data_in, int data_size, float max) { - float *tmp = *data_in; - float fix_range = 127; - float scale = fix_range / max; - - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8( - (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} - -void align_element(char **data_in, int num, int chw) { - int i = 0; - int j = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - if (align_chw != chw) { - char *tmp = *data_in; - char *data_tmp = - (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num * align_chw); - for (j = 0; j < num; j++) { - memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); - } - *data_in = data_tmp; - fpga_free(tmp); - } -} - -void align_num(char **data_in, int num_per_div_before_alignment, int num, - int chw) { - int i = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(char)); - - for (i = 0; i < div_num - 1; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - num_per_div_before_alignment * align_chw); - } - - 
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); - - *data_in = data_tmp; - fpga_free(tmp); -} - -void reorder(char **data_in, int num_after_alignment, int chw) { - int index = 0; - int new_index; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - for (index = 0; index < num_after_alignment; index++) { - new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + - (index / 16 % 2 * 4); - memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align, - chw_align); - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void interleave(char **data_in, int num_after_alignment, int chw) { - int i = 0; - int j = 0; - int k = 0; - int interleave_per_num = 16; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - int interleave_num = chw_align * 2 / interleave_per_num; - for (i = 0; i < num_after_alignment; i += 2) { - for (j = 0, k = 0; j < interleave_num; j += 2, k++) { - memcpy(data_tmp + i * chw_align + interleave_per_num * j, - *data_in + i * chw_align + interleave_per_num * k, - interleave_per_num); - memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), - *data_in + (i + 1) * chw_align + interleave_per_num * k, - interleave_per_num); - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void format_filter(float **data_in, int num, int channel, int height, int width, - int group_num, float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int num_per_div_before_alignment = - calc_num_per_div(num, group_num, 
division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - - reorder(quantize_data, num_after_alignment, chw); - interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * - num_after_alignment * sizeof(char)); -} - -void convert_fc_filter(char **data_in, int num, int chw) { - char *tmp = *data_in; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - for (int c = 0; c < chw; c++) { - data_tmp[n * chw + c] = (*data_in)[num * c + n]; - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void format_fc_filter(float **data_in, int num, int channel, int height, - int width, int group_num, float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int num_per_div_before_alignment = - calc_num_per_div(num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? 
div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_fc_filter(quantize_data, num, chw); - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - reorder(quantize_data, num_after_alignment, chw); - interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * - num_after_alignment * sizeof(char)); -} -void convert_to_hwn(int16_t **data_in, int num, int height, int width) { - int16_t *tmp = *data_in; - int16_t *data_tmp = - (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - *(data_tmp + h * width * num + w * num + n) = *((*data_in)++); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void align_element_n(int16_t **data_in, int num, int height, int width) { - int unalign_n = num; - int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); - if (unalign_n == align_n) { - return; - } else { - int16_t *tmp = *data_in; - - int num_element = height * width * align_n; - int16_t *data_tmp = - (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(int16_t)); - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int offset_unalign = h * width * unalign_n + w * unalign_n; - int offset_align = h * width * align_n + w * align_n; - for (int n = 0; n < unalign_n; n++) { - data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n); - } - } - } - - *data_in = data_tmp; - fpga_free(tmp); - } -} -void quantize_to_fp16(float **data_in, int num, int height, int width, - float *scale_ptr) { - float *tmp = *data_in; - int size = num * 
height * width; - - int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t)); // NOLINT - for (int n = 0; n < num; n++) { - float scale_val = scale_ptr[n]; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int index = n * height * width + h * width + w; - tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val); - } - } - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} -void format_dwconv_filter(float **data_in, int num, int height, int width, - float *scale_ptr) { - quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT - convert_to_hwn(quantize_data, num, height, width); - align_element_n(quantize_data, num, height, width); - fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * - height * width * sizeof(int16_t)); -} - -void format_DWDeconv_filter(float **data_in, int num, int height, int width, - float *scale_ptr) { - quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT - convert_to_hwn(quantize_data, num, height, width); - align_element_n(quantize_data, num, height, width); - fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * - height * width * sizeof(int16_t)); -} -} // namespace filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/filter.h b/mobile/src/fpga/V1/filter.h deleted file mode 100755 index 4812a75af2..0000000000 --- a/mobile/src/fpga/V1/filter.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -namespace paddle_mobile { -namespace fpga { -namespace filter { - -int calc_division_capacity(int chw); -int calc_split_num(int num, int division_capacity); -int calc_division_number(int num, int group_num, int division_capacity); -int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc(char** data_in, int num, int channel, int height, - int width); -float find_max(float* data_in, int data_size); -void quantize(float** data_in, int data_size, float max); -void align_element(char** data_in, int num, int chw); -void align_num(char** data_in, int num_per_div_before_alignment, int num, - int chw); -void reorder(char** data_in, int num_after_alignment, int chw); -void interleave(char** data_in, int num_after_alignment, int chw); -void format_filter(float** data_in, int num, int channel, int height, int width, - int group_num, float max); - -void convert_fc_filter(char** data_in, int num, int chw); -void format_fc_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max); - -void convert_to_hwn(int16_t** data_in, int num, int height, int width); -void align_element_n(int16_t** data_in, int num, int height, int width); -void quantize_to_fp16(float** data_in, int num, int height, int width, - float* scale_ptr); -void format_dwconv_filter(float** data_in, int num, int height, int width, - float* scale_ptr); - -} // namespace filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/image.cpp b/mobile/src/fpga/V1/image.cpp deleted file mode 100644 
index 4ba5af83ab..0000000000 --- a/mobile/src/fpga/V1/image.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/V1/image.h" - -namespace paddle_mobile { -namespace fpga { -namespace image { - -void convert_to_hwc(float **data_in, int channel, int height, int width, - int num) { - float *data_tmp = reinterpret_cast( - fpga_malloc(num * channel * height * width * sizeof(float))); - int64_t amount_per_row = width * channel; - for (int n = 0; n < num; n++) { - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * channel * height * width + offset_height + - w * channel + c) = *((*data_in)++); - } - } - } - } - *data_in = data_tmp; -} - -void convert_to_chw(float **data_in, int channel, int height, int width, - int num) { - float *data_tmp = - (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT - int64_t amount_per_side = width * height; - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + n * height * width * channel + c * amount_per_side + - width * h + w) = *((*data_in)++); - } - } - } - } - *data_in = data_tmp; -} - -void concat_images(int16_t **images_in, float **scales_in, void *image_out, - float *scale_out, int image_num, 
uint32_t *channel_num, - int height, int width) { - int i = 0; - int j = 0; - int k = 0; - int each_out_line_channel = 0; - int align_each_out_area_cw = 0; - int align_each_in_area_cw = 0; - int align_each_out_area_cw_differ = 0; - int tmp_channel = 0; - scale_out[0] = 0.0; - scale_out[1] = 0.0; - for (i = 0; i < image_num; i++) { - each_out_line_channel += channel_num[i]; - scale_out[0] = std::max(*scale_out, scales_in[i][0]); - fpga_invalidate(images_in[i], - height * - align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * - sizeof(int16_t)); - } - scale_out[1] = 1 / scale_out[0]; - align_each_out_area_cw = - align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); - align_each_out_area_cw_differ = - align_each_out_area_cw - each_out_line_channel * width; - - for (k = 0; k < height; k++) { - for (j = 0; j < width; j++) { - for (i = 0; i < image_num; i++) { - align_each_in_area_cw = - align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - memcpy((int16_t *)image_out + tmp_channel + // NOLINT - k * align_each_out_area_cw_differ, - images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, - channel_num[i] * sizeof(int16_t)); - - tmp_channel += channel_num[i]; - } - } - } - - fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); -} - -void split_image(int16_t *image_in, const float *scale_in, void **images_out, - float **scales_out, int image_num, - const uint32_t *channel_nums, int height, int width) { - int total_channel = 0; - for (int i = 0; i < image_num; i++) { - scales_out[i][0] = scale_in[0]; - scales_out[i][1] = scale_in[1]; - total_channel += channel_nums[i]; - } - int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT); - fpga_invalidate(image_in, element_num * sizeof(int16_t)); - - int src_offset = 0, des_offset = 0; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) + - w * total_channel; - for (int i = 0; i < 
image_num; i++) { - des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + - w * channel_nums[i]; - memcpy(reinterpret_cast(images_out[i]) + des_offset, - image_in + src_offset, channel_nums[i] * sizeof(int16_t)); - src_offset += channel_nums[i]; - } - } - } - - for (int i = 0; i < image_num; i++) { - element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT); - fpga_flush(images_out[i], element_num * sizeof(int16_t)); - } -} - -} // namespace image -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/image.h b/mobile/src/fpga/V1/image.h deleted file mode 100644 index f5dc6ffe3e..0000000000 --- a/mobile/src/fpga/V1/image.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include "fpga/common/fpga_common.h" -namespace paddle_mobile { -namespace fpga { -namespace image { - -void convert_to_hwc(float** data_in, int channel, int height, int width, - int num = 1); -void convert_to_chw(float** data_in, int channel, int height, int width, - int num = 1); -// template -// void align_element_conv(Dtype** data_in, int height, int cw); -// template -// void format_image(T** data_in, int channel, int height, int width); -template -void align_element_conv(Dtype** data_in, int height, int cw); -template -void align_element_conv(Dtype** data_in, int height, int cw) { - int h = 0; - int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - - Dtype* data_tmp = - (Dtype*)fpga_malloc(height * align_cw * sizeof(Dtype)); // NOLINT - - memset(data_tmp, 0, height * align_cw * sizeof(Dtype)); - - for (h = 0; h < height; h++) { - memcpy((void*)(data_tmp + h * align_cw), // NOLINT - (void*)(*data_in + h * cw), // NOLINT - cw * sizeof(Dtype)); - } - - *data_in = data_tmp; -} -template -void format_image(T** data_in, int channel, int height, int width) { - int cw = channel * width; - int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - if (align_cw != cw) { - T* hwc_temp = *data_in; - align_element_conv(data_in, height, channel * width); - fpga_free(hwc_temp); - } - fpga_flush(*data_in, - align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T)); -} -// Concat featuremaps along channel direction -void concat_images(int16_t** images_in, float** scales_in, void* image_out, - float* scale_out, int image_num, uint32_t* channel_num, - int height, int width); - -// Split featuremap along channel direction -void split_image(int16_t* image_in, const float* scale_in, void** images_out, - float** scales_out, int image_num, - const uint32_t* channel_nums, int height, int width); -} // namespace image -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V1/pe.cpp b/mobile/src/fpga/V1/pe.cpp 
deleted file mode 100644 index fef971a348..0000000000 --- a/mobile/src/fpga/V1/pe.cpp +++ /dev/null @@ -1,1180 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/common/pe.h" -#include "common/enforce.h" -#include "common/types.h" -#include "fpga/V1/filter.h" -#include "fpga/V1/image.h" -#include "fpga/common/config.h" -#include "fpga/common/driver.h" -#include "fpga/common/fpga_common.h" -#ifdef COST_TIME_PRINT -#include -#include -#include -#include -#endif - -namespace paddle_mobile { -namespace fpga { - -using namespace driver; // NOLINT -using namespace std; // NOLINT -#define USE_RELU 1 -#define USE_BIAS 2 - -// bypass cmd -#define CMD_FP16_TO_FP16 0 -#define CMD_FP16_TO_FP32 1 -#define CMD_FP32_TO_FP16 2 -#define CMD_FP32_TO_FP32 3 -#define CMD_INT8_TO_FP16 4 - -// bypass macro -#define SIZE_FP16 2 -#define SIZE_FP32 4 -#define SIZE_INT8 1 - -#define PE_IRQ_TIMEOUT 1000000 - -/* Interrupt bit-set offset*/ -#define INTERRUPT_RSVD 0x0001 -#define INTERRUPT_BYPASS 0x0002 -#define INTERRUPT_CONV 0x0004 -#define INTERRUPT_POOLING 0x0008 -#define INTERRUPT_EW 0x0010 - -/* Register offset */ -#define REG_INTERRUPT 0x000 -#define REG_VERSION 0x008 -#define REG_TEMPERATURE 0x010 -#define REG_FPGA_RESET 0x018 -#define REG_TEST_REGISTER 0x048 -#define REG_HARDWARE_STATUS 0x050 - -#define REG_TIMER_COUNTER 0x070 - -#define REG_SCALE_PARAMETER 0x080 -#define REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR 
0x090 - -#define REG_FLASH_CMD 0x200 -#define REG_FLASH_DATA 0x208 -#define REG_FLASH_CONFIG 0x210 -#define REG_FLASH_STATUS 0x218 -#define REG_SN 0x220 - -/*bypass*/ -#define REG_CONVERT_CMD 0x400 -#define REG_CONVERT_SRC_ADDR 0x408 -#define REG_CONVERT_DST_ADDR 0x410 -#define REG_CONVERT_LENGTH 0x418 - -/*resize*/ -#define REG_RESIZE_CMD 0x600 -#define REG_RESIZE_CHANNEL_NUMBER 0x608 -#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610 -#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618 -#define REG_RESIZE_INPUT_BASE_ADDR 0x620 -#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628 -#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630 -#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638 - -/*pooling*/ -#define REG_POOLING_CMD 0x800 -#define REG_POOLING_IMAGE_BASE_ADDR 0x808 -#define REG_POOLING_RESULT_BASE_ADDR 0x810 -#define REG_POOLING_IMAGE_PIXEL 0x818 -#define REG_POOLING_WINDOW_SIZE 0x820 -#define REG_POOLING_RESULT_PIXEL 0x828 -#define REG_POOLING_PAD_PIXEL 0x830 -#define REG_POOLING_STEP_PIXEL 0x838 -#define REG_POOLING_CHANNEL_NUMBER 0x840 -#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848 -#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850 -#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858 -#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860 -#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 -#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 -#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 -#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 -#define REG_POOLING_MODE_RECIPROCAL 0x890 - -/*conv*/ -#define REG_CONV_CMD 0xC00 -#define REG_CONV_IMAGE_BASE_ADDR 0xC08 -#define REG_CONV_FILTER_BASE_ADDR 0xC10 -#define REG_CONV_SB_BASE_ADDR 0xC18 -#define REG_CONV_RESULT_BASE_ADDR 0xC20 -#define REG_CONV_IMAGE_PIXEL 0xC28 -#define REG_CONV_FILTER_PIXEL 0xC30 -#define REG_CONV_RESULT_PIXEL 0xC38 -#define REG_CONV_PAD_PIXEL 0xC40 -#define REG_CONV_STEP_PIXEL 0xC48 -#define REG_CONV_GROUP_NUMBER 0xC50 -#define 
REG_CONV_FILTER_NUMBER 0xC58 -#define REG_CONV_CHANNEL_NUMBER 0xC60 -#define REG_CONV_FILTER_PER_GROUP 0xC68 -#define REG_CONV_CHANNEL_PER_GROUP 0xC70 -#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78 -#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80 -#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88 -#define REG_CONV_FILTER_AMOUNT_ALL 0xC90 -#define REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98 -#define REG_CONV_RESULT_LAST_VALID 0xCA0 - -#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8 -#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0 -#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8 -#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0 -#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8 -#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0 -#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8 -#define REG_CONV_IMAGE_WIN_CNT 0xCE0 -#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8 -#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8 -#define REG_CONV_PROG_FULL_CNT 0xD08 -#define REG_CONV_POST_PROG_FULL_CNT 0xD10 -#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20 - -#define REG_CONV_IMAGE_SCALE 0xD28 -#define REG_CONV_FILTER_SCALE 0xD30 - -/*ew*/ -#define REG_EW_CMD 0x0F00 -#define REG_EW_IMAGE0_BASE_ADDR 0x0F08 -#define REG_EW_IMAGE1_BASE_ADDR 0x0F10 -#define REG_EW_RESULT_BASE_ADDR 0x0F18 -#define REG_EW_DATA_LEN 0x0F20 -#define REG_EW_COEFFICIENT 0x0F28 -#define REG_EW_IMAGE_PIXEL 0x0F30 -#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38 - -/*dwconv*/ -#define REG_DWCONV_FILTER_BASE_ADDR 0xe08 -#define REG_DWCONV_FILTER_SHAPE 0xe10 -#define REG_DWCONV_FILTER_N_ALIGN 0xe18 -#define REG_DWCONV_FILTER_SUBNUMBER 0xe20 -#define REG_DWCONV_CMD 0xe00 - -int ComputeFpgaConv(const struct SplitConvArgs &args) { -// ComputeBasicConv(args.conv_arg[0]); -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFPGAConv==========="; - DLOG << " filter_num:" << args.filter_num - << " group_num:" << args.group_num - << " split_num:" << args.split_num; -#endif - int ret = 0; - int split_num = args.split_num; - for (int i = 0; i < split_num; i++) { - ret |= 
ComputeBasicConv(args.conv_arg[i]); - } - - if (split_num > 1) { - ComputeFPGAConcat(args.concat_arg); - } - - return ret; -} - -int ComputeBasicConv(const struct ConvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "======Compute Basic Conv======"; - // DLOG << " relu_enabled:" << args.relu_enabled - DLOG << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - -#ifdef PADDLE_MOBILE_ZU5 - int ret = 0; - uint64_t output_scale = 0; - - uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - - active_args.activation_type = args.output.activation.activation_type; - - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; - - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; - - DLOG << " activation_type:" << active_args.activation_type - << " leaky_relu_negative_slope:" - << active_args.leaky_relu_negative_slope; - DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { - ret = -EIO; - DLOG << "Conv Status Error!"; - 
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_CONV_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_CONV_FILTER_PIXEL); - - uint64_t output_height_fraction = - args.driver.output_height / ROW_PARALLEL_NUM; - uint64_t output_height_remainder = - args.driver.output_height % ROW_PARALLEL_NUM; - reg_writeq(args.driver.output_height | (output_height_fraction << 16) | - (output_height_remainder << 26) | - (args.driver.output_width << 32), - REG_CONV_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_CONV_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_CONV_STEP_PIXEL); - reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); - reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); - reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); - reg_writeq(*(uint64_t *)args.image.scale_address, // NOLINT - REG_CONV_IMAGE_SCALE); - reg_writeq(*(uint64_t *)args.filter_scale_address, // NOLINT - REG_CONV_FILTER_SCALE); - reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR); - reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR); - reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR); - reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR); - reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP); - reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP); - reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); - 
reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); - reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); - reg_writeq(args.driver.image_block_amount_per_row, 0xca8); - reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0); - reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8); - reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0); - reg_writeq(args.driver.image_block_num, 0xcc8); - reg_writeq(args.driver.image_block_len, 0xcd0); - reg_writeq(args.driver.image_block_len_last, 0xcd8); - reg_writeq(args.driver.image_win_cnt, 0xce0); - reg_writeq(args.driver.image_win_cnt_last, 0xce8); - reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8); - reg_writeq(args.driver.prog_full_cnt, 0xd08); - reg_writeq(args.driver.post_prog_full_cnt, 0xd10); - reg_writeq(args.driver.deconv_param, 0xd18); - reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); - reg_writeq(args.driver.cmd, REG_CONV_CMD); - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; - ret = -EIO; - DLOG << "Conv Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "Conv Wait Irq Timeout"); - } - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - active_args.activation_type = NONE; - reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - - return ret; -#endif - return 0; -} // ComputeBasicConv - -int ComputeFpgaPool(const struct PoolingArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaPool==========="; - DLOG << " mode:" << args.mode - << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << 
args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - DLOG << "Polling"; - // return 0; - uint64_t output_scale = 0; - uint64_t timer_cnt = 0; - int ret = 0; - uint64_t cmd = 0; - uint64_t image_physical_address = 0; - uint64_t output_physical_address = 0; - - uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - active_args.activation_type = args.output.activation.activation_type; - - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; - - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; - - DLOG << " activation_type:" << active_args.activation_type - << " leaky_relu_negative_slope:" - << active_args.leaky_relu_negative_slope; - DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - image_physical_address = vaddr_to_paddr_driver(args.image.address); - output_physical_address = vaddr_to_paddr_driver(args.output.address); - uint32_t output_height = (uint32_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( - (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * 
(uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | - (((uint64_t)args.kernel_reciprocal)); - - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { - ret = -EIO; - DLOG << "Conv Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) 
| (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); - reg_writeq(cmd, REG_POOLING_CMD); - - DLOG << "before reg poll"; - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; - ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "Pooling Wait Irq Timeout!"); - } - DLOG << "after reg poll"; - - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - active_args.activation_type = NONE; - reg_writeq(reg_ActivationArgs, 
REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - - return ret; -#endif - return 0; -} // ComputeFpgaPool - -int ComputeFpgaEWAdd(const struct EWAddArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaEWAdd==========="; - // DLOG << " relu_enabled:" << args.relu_enabled - DLOG << " const0:" << fp16_2_fp32(int16_t(args.const0)) - << " const1:" << fp16_2_fp32(int16_t(args.const1)); - DLOG << " image0_address:" << args.image0.address - << " image0_scale_address:" << args.image0.scale_address - << " image0_channels:" << args.image0.channels - << " image0_height:" << args.image0.height - << " image0_width:" << args.image0.width - << " pad0_height:" << args.image0.pad_height - << " pad0_width:" << args.image0.pad_width; - DLOG << " image1_address:" << args.image1.address - << " image1_scale_address:" << args.image1.scale_address - << " image1_channels:" << args.image1.channels - << " image1_height:" << args.image1.height - << " image1_width:" << args.image1.width - << " pad1_height:" << args.image1.pad_height - << " pad_width:" << args.image1.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - int ret = 0; - uint64_t output_scale = 0; - - uint64_t reg_ActivationArgs = 0; - ActivationArgs active_args; - active_args.activation_type = args.output.activation.activation_type; - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; - DLOG << " activation_type:" << active_args.activation_type - << " leaky_relu_negative_slope:" - << active_args.leaky_relu_negative_slope; - DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { - ret = -EIO; - DLOG << "EW 
Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); - reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); - reg_writeq(args.driver.cmd, REG_EW_CMD); - - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; - ret = -EIO; - DLOG << "EW Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); - } - - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - active_args.activation_type = NONE; - reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; -#endif - return 0; -} // ComputeFpgaEWAdd - -int PerformBypass(const struct BypassArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaBypass==========="; - DLOG << " input_type:" << args.input_data_type - << " output_type:" << args.output_data_type - << " input_layout_type:" << args.input_layout_type - << " output_layout_type:" << args.output_layout_type; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << 
args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); - uint64_t output_scale = 0; - uint64_t timer_cnt = 0; - uint64_t cmd = 0; - uint64_t datalen = 0; - uint64_t input_address_phy = 0; - uint64_t output_address_phy = 0; - uint8_t data_cell_in = 0; - uint8_t data_cell_out = 0; - int ret = 0; - - uint64_t reg_ActivationArgs = 0; - ActivationArgs active_args; - active_args.activation_type = args.output.activation.activation_type; - - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; - - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; - - datalen = (uint64_t)args.image.width * (uint64_t)args.image.height * - (uint64_t)args.image.channels; - datalen = align_to_x(datalen, 16); - input_address_phy = vaddr_to_paddr_driver(args.image.address); - output_address_phy = vaddr_to_paddr_driver(args.output.address); - DLOG << "input_phy:" << input_address_phy; - DLOG << "output_phy:" << output_address_phy; - - switch (args.input_data_type) { - case DATA_TYPE_FP16: { - switch (args.output_data_type) { - case DATA_TYPE_FP16: - data_cell_in = SIZE_FP16; - data_cell_out = SIZE_FP16; - cmd = CMD_FP16_TO_FP16; - break; - - case DATA_TYPE_FP32: - data_cell_in = SIZE_FP16; - data_cell_out = SIZE_FP32; - cmd = CMD_FP16_TO_FP32; - break; - - default: - break; - } - } break; - - case DATA_TYPE_INT8: { - if (args.output_data_type != DATA_TYPE_FP16) { - DLOG << "error:Output Datetype error,not DATA_TYPE_FP16: " - << args.output_data_type; - } - data_cell_in = SIZE_INT8; - data_cell_out = SIZE_FP16; - cmd = CMD_INT8_TO_FP16; - } break; - - case DATA_TYPE_FP32: { - switch (args.output_data_type) { - case DATA_TYPE_FP16: - data_cell_in = SIZE_FP32; - data_cell_out = SIZE_FP16; - cmd = 
CMD_FP32_TO_FP16; - break; - - case DATA_TYPE_FP32: - data_cell_in = SIZE_FP32; - data_cell_out = SIZE_FP32; - cmd = CMD_FP32_TO_FP32; - break; - - default: - break; - } - } break; - - default: - break; - } - if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 && - cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32 && - cmd != CMD_INT8_TO_FP16) { - // std::cout<< " err back Error1!" <mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) { - ret = -EIO; - DLOG << "Bypass Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR); - reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR); - reg_writeq(datalen, REG_CONVERT_LENGTH); - reg_writeq(cmd, REG_CONVERT_CMD); - DLOG << "before reg poll"; - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR; - ret = -EIO; - DLOG << "BYPASS Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "BYPASS Wait Irq Timeout!"); - } - DLOG << "after reg poll"; - - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; -#endif - return 0; -} // PerformBypass - -uint64_t FPGAVersion() { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaBypass==========="; -#endif -#ifdef PADDLE_MOBILE_ZU5 - uint64_t fpga_ver = 0; - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - fpga_ver = reg_readq(REG_HARDWARE_STATUS); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return fpga_ver; -#endif - return 0; -} // FPGAVersion - -int ComputeFPGAConcat(const struct 
ConcatArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaConcat==========="; - DLOG << " Image_num: " << args.image_num - << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out - << " out_channel:" << args.out_channel; - DLOG << " image_height:" << args.height << " image_width:" << args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" - << args.channel_num[i] - //<< " aligned_channel_num:" << args.aligned_channel_num[i] - << " image_address:" << args.images_in[i] - << " image_scale_address:" << args.scales_in[i]; - } -#endif - - image::concat_images(args.images_in, args.scales_in, args.image_out, - args.scale_out, args.image_num, args.channel_num, - args.height, args.width); - return 0; -} // ComputeFPGAConcat - -void deconv_post_process(const struct DeconvArgs &args) { - int sub_conv_n = args.sub_conv_num; - int sub_height = args.sub_output_height; - int sub_width = args.sub_output_width; - int omit_size = args.omit_size; - int channel = args.filter_num; - int num = 1; - int origin_h = sub_height * sub_conv_n; - int origin_w = sub_width * sub_conv_n; - int align_origin_w = align_to_x(origin_w * channel, 16); - int deconv_h = origin_h - 2 * omit_size; - int deconv_w = origin_w - 2 * omit_size; - int deconv_row_len = deconv_w * channel; - int align_deconv_row_len = align_to_x(deconv_row_len, 16); - - for (int idx = 0; idx < sub_conv_n; ++idx) { - paddle_mobile::fpga::fpga_invalidate( - args.split_conv_args[idx]->output.address, - align_origin_w * origin_h * sizeof(int16_t)); - } - - int deconv_idx = 0; - for (int nn = 0; nn < num; ++nn) { - for (int hh = 0; hh < origin_h; ++hh) { - int hx = (hh % sub_conv_n); - auto sub_t = - (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1] // NOLINT - ->output.address); - int hi = (hh / sub_conv_n); - if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; - int sidx = (nn * origin_h * align_origin_w + hi * 
align_origin_w + - omit_size * channel); - fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT - sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT - deconv_idx += align_deconv_row_len; - } - } - fpga_flush(args.output.address, - num * align_deconv_row_len * deconv_h * sizeof(int16_t)); -} -void DWDeconv_post_process(const struct DWDeconvArgs &args) { - int sub_conv_n = args.sub_conv_num; - int sub_height = args.sub_output_height; - int sub_width = args.sub_output_width; - int omit_size = args.omit_size; - int channel = args.filter_num; - int num = 1; - int origin_h = sub_height * sub_conv_n; - int origin_w = sub_width * sub_conv_n; - int align_origin_w = align_to_x(origin_w * channel, IMAGE_ALIGNMENT); - int deconv_h = origin_h - 2 * omit_size; - int deconv_w = origin_w - 2 * omit_size; - int deconv_row_len = deconv_w * channel; - int align_deconv_row_len = align_to_x(deconv_row_len, IMAGE_ALIGNMENT); - - for (int idx = 0; idx < sub_conv_n; ++idx) { - paddle_mobile::fpga::fpga_invalidate( - args.dw_conv_args[idx]->output.address, - align_origin_w * origin_h * sizeof(int16_t)); - } - - int deconv_idx = 0; - for (int nn = 0; nn < num; ++nn) { - for (int hh = 0; hh < origin_h; ++hh) { - int hx = (hh % sub_conv_n); - auto sub_t = (int16_t *)(args.dw_conv_args[sub_conv_n - hx - 1] // NOLINT - ->output.address); - int hi = (hh / sub_conv_n); - if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; - int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + - omit_size * channel); - fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT - sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT - deconv_idx += align_deconv_row_len; - } - } - fpga_flush(args.output.address, - num * align_deconv_row_len * deconv_h * sizeof(int16_t)); -} - -int ComputeFpgaDeconv(const struct DeconvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFPGADeConv==========="; - DLOG << " filter_num:" << args.filter_num - 
<< " group_num:" << args.group_num << "omit_size:" << args.omit_size - << "sub_output_width: " << args.sub_output_width - << "sub_output_height: " << args.sub_output_height - << " sub_conv_num:" << args.sub_conv_num; - DLOG << "args.output.address: " << args.output.address - << "args.output.scale_address: " << args.output.scale_address; - -#endif - - int sub_conv_num = args.sub_conv_num; - -#ifdef COST_TIME_PRINT - timeval start, end; - long dif_sec, dif_usec; // NOLINT -#endif - - for (int i = 0; i < sub_conv_num; i++) { -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - - ComputeFpgaConv(*args.split_conv_args[i]); -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv basic_conv: " << i << " times: " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - } - - if (sub_conv_num > 1) { - float max_scale = -1.0f; -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - for (int i = 0; i < sub_conv_num; i++) { - paddle_mobile::fpga::fpga_invalidate( - args.split_conv_args[i]->output.scale_address, 2 * sizeof(float)); - float ptr_scale = (args.split_conv_args[i]->output.scale_address)[0]; - if (ptr_scale > max_scale) { - args.output.scale_address[0] = ptr_scale; - args.output.scale_address[1] = - (args.split_conv_args[i]->output.scale_address)[1]; - } - } - -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv scale " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - - // fpga_flush(args.output.scale_address, 2 * sizeof(float)); - /*#ifdef COST_TIME_PRINT - gettimeofday(&start,NULL); - #endif - //deconv_post_process(args); - #ifdef COST_TIME_PRINT - gettimeofday(&end,NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << 
"deconv_post_process " << " cost time: " << - (dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/ - } - - return 0; -} // ComputeFpgaDeconv - -int ComputeFPGASplit(const struct SplitArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaSplit==========="; - DLOG << " Image_num: " << args.image_num - << " in_address:" << args.image_in - << " in_scale_address:" << args.scale_in; - DLOG << " image_height:" << args.height << " image_width:" << args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.out_channel_nums[i] - << " image_address:" << args.images_out[i] - << " image_scale_address:" << args.scales_out[i]; - } -#endif - image::split_image(args.image_in, args.scale_in, args.images_out, - args.scales_out, args.image_num, args.out_channel_nums, - args.height, args.width); - return 0; -} // ComputeFPGASplit -int ComputeDWConv(const struct DWconvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeDWConv==========="; - // DLOG << " mode:" << args.relu_enabled; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " filter_address:" << args.filter_address - << " bias_address:" << args.bias_address; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - DLOG << "DWConv"; - // return 0; - uint64_t output_scale = 0; - uint64_t timer_cnt = 0; - int ret = 0; - // uint64_t cmd = args.relu_enabled; - uint64_t cmd = 0; - uint64_t 
image_physical_address = 0; - uint64_t output_physical_address = 0; - uint64_t filter_physical_address = 0; - uint64_t bias_physical_address = 0; - - image_physical_address = vaddr_to_paddr(args.image.address); - output_physical_address = vaddr_to_paddr(args.output.address); - filter_physical_address = vaddr_to_paddr(args.filter_address); - bias_physical_address = vaddr_to_paddr(args.bias_address); - uint64_t filter_N_align = - align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t filter_amount_per_row_align = - filter_N_align * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = filter_N_align * - (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height; - uint64_t filter_amount_align = - sub_filter_amount_align * (uint64_t)args.sub_conv_num; - - uint32_t output_height = (uint32_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( - ((args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1) * - args.sub_conv_num); - - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - 
FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - - uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - - active_args.activation_type = args.output.activation.activation_type; - - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; - - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; - - DLOG << " activation_type:" << active_args.activation_type - << " leaky_relu_negative_slope:" - << active_args.leaky_relu_negative_slope; - DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { - ret = -EIO; - DLOG << "DWConv Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - /*restart scale*/ - reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq((bias_physical_address << 32 | filter_physical_address), - REG_DWCONV_FILTER_BASE_ADDR); - reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), - REG_DWCONV_FILTER_SHAPE); - reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), - 
REG_DWCONV_FILTER_SUBNUMBER); - reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); - - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - - /*SDK刷Cache保证数据一致性*/ - - reg_writeq(cmd, REG_DWCONV_CMD); - - DLOG << "before reg poll"; - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; - ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); - } - DLOG << "after reg poll"; - - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - 
output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - DLOG << "output_scale:" << output_scale; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; -#endif - return 0; -} -int ComputeDWDeconv(const struct DWDeconvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFPGADeConv==========="; - DLOG << " filter_num:" << args.filter_num - << " group_num:" << args.group_num << "omit_size:" << args.omit_size - << "sub_output_width: " << args.sub_output_width - << "sub_output_height: " << args.sub_output_height - << " sub_conv_num:" << args.sub_conv_num; - DLOG << "args.output.address: " << args.output.address - << "args.output.scale_address: " << args.output.scale_address; - -#endif - - int sub_conv_num = args.sub_conv_num; - -#ifdef COST_TIME_PRINT - timeval start, end; - long dif_sec, dif_usec; // NOLINT -#endif - - for (int i = 0; i < sub_conv_num; i++) { -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - - ComputeDWConv(*args.dw_conv_args[i]); -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv basic_conv: " << i << " times: " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - } - - if (sub_conv_num > 1) { - float max_scale = -1.0f; -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - for (int i = 0; i < sub_conv_num; i++) { - paddle_mobile::fpga::fpga_invalidate( - args.dw_conv_args[i]->output.scale_address, 2 * sizeof(float)); - float ptr_scale = (args.dw_conv_args[i]->output.scale_address)[0]; - if (ptr_scale > max_scale) { - args.output.scale_address[0] = ptr_scale; - args.output.scale_address[1] = - (args.dw_conv_args[i]->output.scale_address)[1]; - } - } - -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - 
std::cout << "deconv scale " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - } - -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - DWDeconv_post_process(args); -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - return 0; -} // ComputeFpgaDeconv - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/api.cpp b/mobile/src/fpga/V2/api.cpp deleted file mode 100644 index 1a90cb5bdc..0000000000 --- a/mobile/src/fpga/V2/api.cpp +++ /dev/null @@ -1,1011 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/V2/api.h" -#include -#include "fpga/V2/bias_scale.h" -#include "fpga/V2/deconv_filter.h" -#include "fpga/V2/filter.h" -#include "fpga/V2/image.h" - -namespace paddle_mobile { -namespace fpga { - -#define USE_RELU 1 -#define USE_BIAS 2 - -void format_image(framework::Tensor *image_tensor) { - auto dims = image_tensor->dims(); - auto channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = image_tensor->data(); - auto external_ptr = reinterpret_cast(image_tensor->external_data); - int8_t *p_data = external_ptr == nullptr ? 
data_ptr : external_ptr; - - image::format_image(&p_data, channel, height, width); - if (p_data != data_ptr) { - image_tensor->reset_data_ptr(p_data); - } -} - -void format_ofm(framework::Tensor *ofm_tensor) { - if (ofm_tensor->type() == type_id()) { - format_fp32_ofm(ofm_tensor); - } else { - format_int8_ofm(ofm_tensor); - } -} - -void format_int8_ofm(framework::Tensor *ofm_tensor) { - auto dims = ofm_tensor->dims(); - size_t memory_size = 0; - if (dims.size() == 4) { - auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1], - height = dims[2], width = dims[3]; - memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * - sizeof(int8_t); - } else if (dims.size() == 2) { - auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1]; - memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t); - } else { - DLOG << "Wrong ofm dimension"; - } - auto p = fpga_malloc(memory_size); - ofm_tensor->reset_data_ptr(p); - ofm_tensor->set_type(type_id().hash_code()); - ofm_tensor->fpga_data_num = memory_size / sizeof(int8_t); - fpga::fpga_flush(p, memory_size); -} - -void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { - size_t memory_size = 0; - if (dims.size() == 4) { - auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1], - height = dims[2], width = dims[3]; - memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * - sizeof(int8_t); - } else if (dims.size() == 2) { - auto num = (dims[0] == 0) ? 
1 : dims[0], channel = dims[1]; - memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t); - } else { - DLOG << "Wrong ofm dimension"; - } - auto p = fpga_malloc(memory_size); - ofm_tensor->reset_data_ptr(p); - ofm_tensor->set_type(type_id().hash_code()); - ofm_tensor->fpga_data_num = memory_size / sizeof(int8_t); - fpga::fpga_flush(p, memory_size); -} - -void format_fp32_ofm(framework::Tensor *ofm_tensor) { - auto dims = ofm_tensor->dims(); - size_t memory_size = 0; - if (dims.size() == 4) { - auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1], - height = dims[2], width = dims[3]; - memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * - sizeof(float); - } else if (dims.size() == 2) { - auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1]; - memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(float); - } else { - DLOG << "Wrong ofm dimension"; - } - auto p = fpga_malloc(memory_size); - ofm_tensor->reset_data_ptr(p); - ofm_tensor->set_type(type_id().hash_code()); - ofm_tensor->fpga_data_num = memory_size / sizeof(float); - fpga::fpga_flush(p, memory_size); -} - -float filter_find_max(framework::Tensor *filter_tensor) { - auto filter_ptr = filter_tensor->data(); - return filter::find_max(filter_ptr, filter_tensor->numel()); -} - -int get_plit_num(framework::Tensor *filter_tensor) { - auto dims = filter_tensor->dims(); - auto chw = dims[1] * dims[2] * dims[3]; - auto num = dims[0]; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); -} -int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) { - auto dims = filter_tensor->dims(); - auto chw = dims[1] * dims[2] / stride * dims[3] / stride; - auto num = dims[0] * stride; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); -} - -int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { - auto dims = 
filter_tensor->dims(); - auto chw = dims[1] * dims[2] * dims[3]; - auto num = dims[0]; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_num_per_div(num, group_num, div_capacity); -} - -int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor, - int group_num, int stride) { - auto dims = filter_tensor->dims(); - auto chw = dims[1] * dims[2] / stride * dims[3] / stride; - auto num = dims[0] * stride; - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_num_per_div(num, group_num, div_capacity); -} - -int get_aligned_filter_element_num(int chw) { - return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); -} - -void format_filter(framework::Tensor *filter_tensor, float max_value, - int group_num) { - filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT - filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT - auto dims = filter_tensor->dims(); - auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - filter::format_filter(&new_data, num, channel, height, width, group_num, - max_value); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} -void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { - auto dims = filter_tensor->dims(); - auto num = dims[0], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} - -void 
format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, - int stride) { - auto dims = filter_tensor->dims(); - auto num = dims[0], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - - int hw = height * width; - deconv_filter::deconv_NC_convert(&new_data, num, 1, hw); - - num = dims[1]; - int channel = dims[0]; - - deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width, - scale_ptr, stride); - - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} - -void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { - filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT - filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT - auto dims = filter_tensor->dims(); - auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - filter::format_fc_filter(&new_data, num, channel, height, width, 1, - max_value); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} -void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, - int group_num, int stride) { - filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT - filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT - auto dims = filter_tensor->dims(); - auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->data(); - size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, 
data_ptr, memory_size); - - int hw = height * width; - deconv_filter::deconv_NC_convert(&new_data, num, channel, hw); - - num = dims[1]; - channel = dims[0]; - deconv_filter::deconv_format_filter( - &new_data, (int)num, (int)channel, // NOLINT - (int)height, // NOLINT - (int)width, group_num, max_value, stride); // NOLINT - - framework::DDim dims_new = - framework::make_ddim({num, channel, height, width}); - filter_tensor->Resize(dims_new); - filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(type_id().hash_code()); -} - -void format_bias_scale_array(float **bias_scale_array, - int element_num_per_division, int num) { - bias_scale::format_bias_scale_array(bias_scale_array, - element_num_per_division, num); -} -void format_bias_array(float **bias_array, int num) { - bias_scale::format_bias_array(bias_array, num); -} - -void format_concat_output(framework::Tensor *out, int height, int width, - int image_num, uint32_t *channel_num) { - int sum_channel = 0, sum_cw = 0; - for (int i = 0; i < image_num; i++) { - sum_channel += channel_num[i]; - } - - sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT); - auto data_ptr = fpga_malloc(height * sum_cw * sizeof(int8_t)); - auto ddim = framework::make_ddim({1, sum_channel, height, width}); - out->Resize(ddim); - out->reset_data_ptr(data_ptr); - out->set_type(type_id().hash_code()); -} -void format_conv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float **bs_ptr, - int group) { - float max_value = fpga::filter_find_max(filter_tensor); - fpga::format_filter(filter_tensor, max_value, group); - int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group); - fpga::format_bias_scale_array(bs_ptr, element_num_per_div, - ofm_tensor->dims()[1]); - fpga::format_ofm(ofm_tensor); -} -void format_deconv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float **bs_ptr, - int group, int sub_conv_n) { - int channel = ofm_tensor->dims()[1]; - float max_value 
= filter_find_max(filter_tensor); - format_deconv_filter(filter_tensor, max_value, group, sub_conv_n); - int element_num_per_div = - get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n); - format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n); - format_ofm(ofm_tensor); -} - -void format_dwconv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float *scale_ptr, - float **bias_ptr) { - auto channel = ofm_tensor->dims()[1]; - format_dwconv_filter(filter_tensor, scale_ptr); - format_bias_array(bias_ptr, channel); - format_ofm(ofm_tensor); -} -void format_DWDeconv_data(framework::Tensor *filter_tensor, - framework::Tensor *ofm_tensor, float **bs_ptr, - int group, int sub_conv_n) { - int channel = ofm_tensor->dims()[1]; - format_DWDconv_filter( - filter_tensor, - (reinterpret_cast(*bs_ptr) + sub_conv_n * channel), sub_conv_n); - format_bias_array(bs_ptr, channel); - format_ofm(ofm_tensor); -} - -void expand_conv_arg(ConvArgs *arg) { - ConvArgs args = *arg; - - auto fpga_bias_scale_len = - align_to_x(args.filter_num / args.group_num, BS_NUM_ALIGNMENT) * - args.group_num; - fpga_bias_scale_len = fpga_bias_scale_len / BIAS_SCALE_DMA_NUM; - - auto output_height = - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1; - auto output_width = - (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1; - - auto filter_per_group = args.filter_num / args.group_num; - auto channel_per_group = args.image.channels / args.group_num; - - auto image_row_count = args.image.width * args.image.channels; - auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); - auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) + - args.image.pad_width * args.image.channels; - auto filter_amount_all = - align_to_x(args.kernel.height * args.kernel.width * channel_per_group, - FILTER_ELEMENT_ALIGNMENT); - - auto 
output_amount_per_row = align_to_x( - (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num, - RESULT_ALIGNMENT); - - // find the opt partition strategy - uint64_t res_win; - uint64_t res_fit = 0; - for (res_win = 1; res_win <= output_width; res_win++) { - if ((align_to_x( - (args.image.channels * - (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), - IMAGE_ALIGNMENT) / - IMAGE_ALIGNMENT + - 1) * - args.kernel.height > - 256) { - break; - } - } - - if (res_win != output_width) { - res_win -= 1; - } - - if (((res_win % 2) != 0) && (res_win != 1)) { - res_win = res_win - 1; - } - // PADDLE_MOBILE_ENFORCE(res_win >= 2, "window too bigger than fpga volume"); - res_fit = res_win; - - auto block_num = (output_width + res_fit - 1) / res_fit; - auto block_len = res_fit; - auto block_last = output_width - res_fit * (block_num - 1); - - auto res_amount_per_row = - (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num; - auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; - - auto image_block_amount_per_row = - args.kernel.stride_w * res_fit * args.image.channels; - auto filter_pad_width_mul_channel = - args.image.pad_width * args.image.channels; - auto image_amount_per_row_multi_win_first = - image_amount_per_row * - (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height); - auto image_amount_per_row_multi_win = - image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h); - - auto image_block_num = block_num; - auto image_block_len = - align_to_x((args.image.channels * - (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), - IMAGE_ALIGNMENT) / - IMAGE_ALIGNMENT + - 1; - auto image_block_len_last = - align_to_x( - (args.image.channels * - (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), - IMAGE_ALIGNMENT) / - IMAGE_ALIGNMENT + - 1; - auto image_win_cnt = block_len; - auto image_win_cnt_last = block_last; - auto res_row_data_align4_pad = res_amount_per_row_pad / 8; - 
auto prog_full_cnt = 1024 / (filter_amount_all / 16 * 2) - 1; - if (prog_full_cnt == 511) { - prog_full_cnt--; - } - auto post_prog_full_cnt = - (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) - ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) - : 0; - auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; - // auto cmd = 0UL | USE_BIAS; - - auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | - ((args.deconv_tx_param.sub_conv_num) << 8) | - ((args.deconv_tx_param.omit_size) << 0); - - (*arg).driver.filter_per_group = filter_per_group; - (*arg).driver.channel_per_group = channel_per_group; - (*arg).driver.image_one_pad_per_row = image_one_pad_per_row; - (*arg).driver.deconv_param = deconv_param; - // new - (*arg).driver.col_padding_up = args.image.pad_width * args.image.channels; - (*arg).driver.col_padding_down = image_one_pad_per_row; - (*arg).driver.row_padding_up = args.image.pad_height; - (*arg).driver.row_padding_down = args.image.pad_height + args.image.height; - (*arg).driver.image_block_amount_per_row = image_block_amount_per_row; - (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel; - (*arg).driver.image_win_cnt = image_win_cnt; - (*arg).driver.image_win_cnt_last = image_win_cnt_last; - (*arg).driver.filter_row = args.kernel.width * args.image.channels; - (*arg).driver.filter_width = args.kernel.width; - (*arg).driver.filter_height = args.kernel.height; - (*arg).driver.skip_window = args.image.channels * args.kernel.stride_w; - (*arg).driver.stride_h = args.kernel.stride_h; - (*arg).driver.filter_amount_all = filter_amount_all; - (*arg).driver.prog_full_cnt = prog_full_cnt; - (*arg).driver.filter_align = args.filter_num / (4 * PE_COLUMN) + - (((args.filter_num % (4 * PE_COLUMN))) ? 
1 : 0); - (*arg).driver.filter_num = args.filter_num; - (*arg).driver.output_width = output_width; - (*arg).driver.output_amount_per_row = output_amount_per_row; - (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad; - (*arg).driver.cal_res_num = output_height / ROW_PARALLEL_NUM + - ((output_height % ROW_PARALLEL_NUM) ? 1 : 0) - 1; - (*arg).driver.last_cal_res_row_num = - (output_height % (ROW_PARALLEL_NUM)) - ? (output_height % (ROW_PARALLEL_NUM)) - : (ROW_PARALLEL_NUM); - - (*arg).driver.post_prog_full_cnt = post_prog_full_cnt; - (*arg).driver.deconv_skip_row = - ROW_PARALLEL_NUM * - args.deconv_tx_param.sub_conv_num; // paralvl*deconv_group - (*arg).driver.deconv_res_skip_row = - args.deconv_tx_param.sub_conv_num * - output_amount_per_row; // deconv_group * result_amount_per_row - (*arg).driver.deconv_ena = args.deconv_tx_param.deconv_en; - (*arg).driver.deconv_dump = args.deconv_tx_param.omit_size; - (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) + - args.deconv_tx_param.out_addr_offset; - (*arg).driver.output_height = output_height; - (*arg).driver.result_amount_per_row_multi_para = - output_amount_per_row / RESULT_ALIGNMENT * - (args.deconv_tx_param.deconv_en ? 
(*arg).driver.deconv_skip_row - : ROW_PARALLEL_NUM); - (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); - (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len; - (*arg).driver.filter_amount_whole = filter_amount_all; - (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address); - (*arg).driver.filters_amount_whole = - filter_amount_all * (*arg).driver.filter_align * (4 * PE_COLUMN); - (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); - (*arg).driver.image_hight = args.image.height; - (*arg).driver.image_amount_per_row = image_amount_per_row; - (*arg).driver.image_amount_per_row_multi_win_first = - image_amount_per_row_multi_win_first; - (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win; - (*arg).driver.filter_pad_hight = args.image.pad_height; - (*arg).driver.image_block_num = image_block_num; - (*arg).driver.image_block_len = image_block_len; - (*arg).driver.image_block_len_last = image_block_len_last; - - (*arg).driver.cmd = cmd; -} // expand_conv_arg() - -void expand_EW_arg(EWAddArgs *arg) { - EWAddArgs args = *arg; - uint64_t cmd = args.relu_enabled ? 
USE_RELU : 0; - uint64_t datalen = (uint64_t)args.image0.width * - (uint64_t)args.image0.height * - (uint64_t)args.image0.channels; - uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; - uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address); - uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address); - uint64_t output_address_phy = vaddr_to_paddr(args.output.address); - - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, - IMAGE_ALIGNMENT); - uint64_t image_amount_per_row_p = align_to_x( - (uint64_t)args.image0.width * (uint64_t)args.image0.channels, 16); - uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | - ((uint64_t)args.image0.width << 16) | - (uint64_t)args.image0.height; - - (*arg).driver.image0_address_phy = image0_address_phy; - (*arg).driver.image1_address_phy = image1_address_phy; - (*arg).driver.datalen = datalen; - (*arg).driver.image_image_pixel = image_image_pixel; - (*arg).driver.image_amount_per_row = - (uint64_t)image_amount_per_row | (uint64_t)(image_amount_per_row_p << 32); - (*arg).driver.output_address_phy = output_address_phy; - (*arg).driver.coefficient = coefficient; - (*arg).driver.cmd = cmd; -} // expand_EW_arg - -void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - bool relu_enabled, int group_num, int stride_h, - int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto out_ptr = out->data(); - auto deleter = [](void *p) { fpga_free(p); }; - - arg->group_num = (uint32_t)group_num; - // Either group_num or split_num = 1; - PADDLE_MOBILE_ENFORCE(group_num == 1, "group_num is not equal to 1"); - arg->split_num = group_num == 1 ? 
(uint32_t)get_plit_num(filter) : 1; - arg->filter_num = (uint32_t)filter->dims()[0]; - arg->output.address = out_ptr; - arg->output.scale_address = out->scale; - arg->conv_arg = - (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT - - arg->shared_conv_arg = std::shared_ptr(arg->conv_arg, deleter); - - memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs)); - - arg->concat_arg.image_num = arg->split_num; - arg->concat_arg.image_out = out_ptr; - arg->concat_arg.scale_out = out->scale; - arg->concat_arg.height = (uint32_t)out->dims()[2]; - arg->concat_arg.width = (uint32_t)out->dims()[3]; - - int n = arg->split_num; - arg->concat_arg.images_in = - static_cast(fpga_malloc(n * sizeof(int *))); - arg->concat_arg.scales_in = - static_cast(fpga_malloc(n * sizeof(float *))); - arg->concat_arg.channel_num = - static_cast(fpga_malloc(n * sizeof(uint32_t))); - arg->vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(arg->concat_arg.images_in), deleter)); - arg->vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(arg->concat_arg.scales_in), deleter)); - arg->vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(arg->concat_arg.channel_num), deleter)); - - auto channel = (int)out->dims()[1]; // NOLINT - int filter_num_per_div = get_filter_num_per_div(filter, group_num); - int element_num = get_aligned_filter_element_num( - (int)(filter->dims()[1] * filter->dims()[2] * // NOLINT - filter->dims()[3])); - - for (int i = 0; i < n; i++) { - arg->conv_arg[i].relu_enabled = relu_enabled; - arg->conv_arg[i].group_num = (uint32_t)group_num; - arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; - arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; - arg->conv_arg[i].image.address = input_ptr; - arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; - arg->conv_arg[i].image.height = 
(uint32_t)input->dims()[2]; - arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_arg[i].image.scale_address = input->scale; - arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; - arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; - arg->conv_arg[i].filter_scale_address = filter->scale; - arg->conv_arg[i].filter_num = (uint32_t)( - i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT - : filter_num_per_div); - - size_t filter_size = - element_num * - align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) * - sizeof(int8_t); - auto filter_head = &( - (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_arg[i].filter_address = fpga_malloc(filter_size); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].filter_address), deleter)); - memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); - fpga_flush(arg->conv_arg[i].filter_address, filter_size); - - size_t bs_size = 2 * - align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) * - sizeof(float); - auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_arg[i].sb_address = fpga_malloc(bs_size); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].sb_address), deleter)); - memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); - fpga_flush(arg->conv_arg[i].sb_address, bs_size); - - if (n > 1) { - arg->conv_arg[i].output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->conv_arg[i].output.address = - fpga_malloc(out->dims()[2] * - align_to_x((int)(out->dims()[3] * // NOLINT - arg->conv_arg[i].filter_num), - IMAGE_ALIGNMENT) * - sizeof(int8_t)); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].output.scale_address), - deleter)); - arg->vector_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->conv_arg[i].output.address), deleter)); - } else { - arg->conv_arg[i].output.scale_address = 
out->scale; - arg->conv_arg[i].output.address = out_ptr; - } - - arg->concat_arg.images_in[i] = - (int8_t *)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = out->scale; - arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; - - expand_conv_arg(&arg->conv_arg[i]); - } - filter->reset_data_ptr(nullptr); - fpga_free(bs_ptr); -} // fill_split_arg - -void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - bool relu_enabled, int group_num, int stride_h, - int stride_w, int padding_h, int padding_w, - float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto deleter = [](void *p) { fpga_free(p); }; - - arg->group_num = (uint32_t)group_num; - arg->sub_conv_num = (uint32_t)stride_h; - arg->filter_num = (uint32_t)filter->dims()[0]; - uint32_t sub_conv_num = arg->sub_conv_num; - int sub_pad = - deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], // NOLINT - padding_w, stride_w); - auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis( - (int)filter->dims()[3], stride_w); // NOLINT - - auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[3], sub_pad, sub_filter_width); // NOLINT - auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[2], sub_pad, sub_filter_width); // NOLINT - - arg->sub_output_width = (uint32_t)sub_output_width; - arg->sub_output_height = (uint32_t)sub_output_height; - arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( - stride_w, (int)filter->dims()[3], padding_w); // NOLINT - - auto sub_channels = (int)input->dims()[1]; // NOLINT - uint32_t omit_size = arg->omit_size; - int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; - int sub_filter_num = sub_conv_num * (arg->filter_num); - - framework::DDim dims_out_new = framework::make_ddim( - {1, arg->filter_num, sub_output_height * 
sub_conv_num, real_out_width}); - fpga::format_int8_ofm(out, dims_out_new); - auto out_ptr = out->data(); - arg->output.address = - (int8_t *)out_ptr + // NOLINT - omit_size * sizeof(int8_t) * - (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); - arg->output.scale_address = out->scale; - - uint32_t conv_output_size = - (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * - sub_output_height; - uint32_t split_num = - group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1; - - for (int i = 0; i < sub_conv_num; ++i) { - arg->split_conv_args.push_back(std::make_shared()); - arg->split_conv_args[i]->filter_num = - (arg->sub_conv_num) * (arg->filter_num); - arg->split_conv_args[i]->group_num = (uint32_t)group_num; - arg->split_conv_args[i]->split_num = split_num; - arg->split_conv_args[i]->concat_arg.height = sub_output_height; - arg->split_conv_args[i]->concat_arg.width = sub_output_width; - arg->split_conv_args[i]->concat_arg.image_num = split_num; - - arg->split_conv_args[i]->conv_arg = - static_cast(fpga_malloc(split_num * sizeof(ConvArgs))); - arg->split_conv_args[i]->concat_arg.images_in = - static_cast(fpga_malloc(split_num * sizeof(int8_t *))); - arg->split_conv_args[i]->concat_arg.scales_in = - static_cast(fpga_malloc(split_num * sizeof(float *))); - arg->split_conv_args[i]->concat_arg.channel_num = - static_cast(fpga_malloc(split_num * sizeof(uint32_t))); - arg->split_conv_args[i]->shared_conv_arg = - std::shared_ptr(arg->split_conv_args[i]->conv_arg, deleter); - arg->split_conv_args[i]->vector_concat_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->concat_arg.images_in), - deleter)); - arg->split_conv_args[i]->vector_concat_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->concat_arg.scales_in), - deleter)); - arg->split_conv_args[i]->vector_concat_space.push_back( - std::shared_ptr( - reinterpret_cast( - 
arg->split_conv_args[i]->concat_arg.channel_num), - deleter)); - } - - auto filter_num_per_div = - (uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w); - int element_num = get_aligned_filter_element_num( - (int)(sub_channels * sub_filter_width * sub_filter_width)); // NOLINT - - int chw = sub_channels * sub_filter_width * sub_filter_width; - int division_capacity = filter::calc_division_capacity(chw); - int num_per_div_before_alignment = - filter::calc_num_per_div(sub_filter_num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = (sub_filter_num + num_per_div_before_alignment - 1) / - num_per_div_before_alignment; - int residual = sub_filter_num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - - int filter_sub_conv_offset = element_num * num_after_alignment; - uint32_t out_addr_offset = 0; - for (int i = 0; i < sub_conv_num; ++i) { - if (sub_conv_num == 1) { - arg->split_conv_args[i]->output.address = arg->output.address; - arg->split_conv_args[i]->output.scale_address = arg->output.scale_address; - out_addr_offset = 0; - - } else { - out_addr_offset = - sizeof(int8_t) * (sub_conv_num - 1 - i) * - (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); - - arg->split_conv_args[i]->output.address = out_ptr; - arg->split_conv_args[i]->output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->output.scale_address), - deleter)); - } - - for (int j = 0; j < split_num; ++j) { - // arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type - // = - // activation_enable; - // arg->split_conv_args[i] - // ->conv_arg[j] - // .output.activation.leaky_relu_negative_slope = - // 
leaky_relu_negative_slope; - arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; - arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num; - - arg->split_conv_args[i]->conv_arg[j].kernel.width = - (uint32_t)sub_filter_width; - arg->split_conv_args[i]->conv_arg[j].kernel.height = - (uint32_t)sub_filter_width; - arg->split_conv_args[i]->conv_arg[j].kernel.stride_w = 1; - arg->split_conv_args[i]->conv_arg[j].kernel.stride_h = 1; - - arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.deconv_en = 1; - arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.sub_conv_num = - sub_conv_num; - arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.omit_size = - omit_size; - arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.out_addr_offset = - out_addr_offset; - - arg->split_conv_args[i]->conv_arg[j].image.scale_address = input->scale; - arg->split_conv_args[i]->conv_arg[j].image.channels = - (uint32_t)sub_channels; - arg->split_conv_args[i]->conv_arg[j].image.width = - (uint32_t)input->dims()[3]; - arg->split_conv_args[i]->conv_arg[j].image.height = - (uint32_t)input->dims()[2]; - arg->split_conv_args[i]->conv_arg[j].image.pad_width = (uint32_t)sub_pad; - arg->split_conv_args[i]->conv_arg[j].image.pad_height = (uint32_t)sub_pad; - arg->split_conv_args[i]->conv_arg[j].image.address = input_ptr; - - arg->split_conv_args[i]->conv_arg[j].filter_scale_address = filter->scale; - arg->split_conv_args[i]->conv_arg[j].filter_num = - (uint32_t)(j == split_num - 1 - ? 
sub_filter_num - (split_num - 1) * filter_num_per_div - : filter_num_per_div); - - size_t filter_size = - element_num * - align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, - FILTER_NUM_ALIGNMENT) * - sizeof(int8_t); - auto filter_head = &(( - int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; - arg->split_conv_args[i]->conv_arg[j].filter_address = - fpga_malloc(filter_size); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].filter_address), - deleter)); - - memcpy(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_head, - filter_size); - fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address, - filter_size); - - size_t bs_align_num = align_to_x( - arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT); - size_t bs_size = 2 * bs_align_num * sizeof(float); - auto bs_head = &bs_ptr[j * filter_num_per_div * 2]; - - arg->split_conv_args[i]->conv_arg[j].sb_address = fpga_malloc(bs_size); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].sb_address), - deleter)); - - memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size); - fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size); - - if (split_num == 1) { - arg->split_conv_args[i]->conv_arg[j].output.address = - arg->split_conv_args[i]->output.address; - arg->split_conv_args[i]->conv_arg[j].output.scale_address = - arg->split_conv_args[i]->output.scale_address; - } else { - arg->split_conv_args[i]->conv_arg[j].output.address = - fpga_malloc(conv_output_size * sizeof(int8_t)); - arg->split_conv_args[i]->conv_arg[j].output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].output.address), - 
deleter)); - arg->split_conv_args[i]->vector_conv_space.push_back( - std::shared_ptr( - reinterpret_cast( - arg->split_conv_args[i]->conv_arg[j].output.scale_address), - deleter)); - } - arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( - arg->split_conv_args[i]->conv_arg[j].output.address); - arg->split_conv_args[i]->concat_arg.scales_in[j] = - arg->split_conv_args[i]->conv_arg[j].output.scale_address; - arg->split_conv_args[i]->concat_arg.channel_num[j] = - arg->split_conv_args[i]->conv_arg[j].filter_num; - - expand_conv_arg(&(arg->split_conv_args[i]->conv_arg[j])); - } - - arg->split_conv_args[i]->concat_arg.image_out = - arg->split_conv_args[i]->output.address; - arg->split_conv_args[i]->concat_arg.scale_out = - arg->split_conv_args[i]->output.scale_address; - } - filter->reset_data_ptr(nullptr); - fpga_free(bs_ptr); -} // fill_deconv_arg - -void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - bool relu_enabled, int stride_h, int stride_w, - int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->data(); - arg->sub_conv_num = 1; - arg->relu_enabled = relu_enabled; - // arg->output.activation.activation_type = activation_enable; - arg->bias_address = bias_ptr; - arg->filter_address = filter_ptr; - arg->kernel.height = (uint32_t)filter->dims()[2]; - arg->kernel.width = (uint32_t)filter->dims()[3]; - arg->kernel.stride_h = (uint32_t)stride_h; - arg->kernel.stride_w = (uint32_t)stride_w; - arg->image.address = input_ptr; - arg->image.channels = (uint32_t)input->dims()[1]; - arg->image.height = (uint32_t)input->dims()[2]; - arg->image.width = (uint32_t)input->dims()[3]; - arg->image.pad_height = (uint32_t)padding_h; - arg->image.pad_width = (uint32_t)padding_w; - arg->image.scale_address = input->scale; - arg->output.address = output_ptr; - arg->output.scale_address = out->scale; -} // end 
dwconv arg fill - -void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - bool relu_enabled, int stride_h, int stride_w, - int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - - auto deleter = [](void *p) { fpga_free(p); }; - - arg->group_num = (uint32_t)filter->dims()[0]; - arg->sub_conv_num = (uint32_t)stride_w; - arg->filter_num = (uint32_t)filter->dims()[0]; - - int sub_conv_num = stride_w; - - int sub_pad = - deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], // NOLINT - padding_w, stride_w); - auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis( - (int)filter->dims()[3], stride_w); // NOLINT - - auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[3], sub_pad, sub_filter_width); // NOLINT - auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[2], sub_pad, sub_filter_width); // NOLINT - - arg->sub_output_width = (uint32_t)sub_output_width; - arg->sub_output_height = (uint32_t)sub_output_height; - arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( - stride_w, (int)filter->dims()[3], padding_w); // NOLINT - - auto sub_channels = (int)input->dims()[1]; // NOLINT - uint32_t omit_size = arg->omit_size; - int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; - int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; - int sub_filter_num = sub_conv_num * (arg->filter_num); - - framework::DDim dims_out_new = framework::make_ddim( - {1, arg->filter_num, real_out_height, real_out_width}); - fpga::format_int8_ofm(out, dims_out_new); - auto out_ptr = out->data(); - - arg->output.address = out_ptr; - arg->output.scale_address = out->scale; - - int filter_offset = sub_filter_width * sub_filter_width * - align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) * - arg->sub_conv_num; - - 
for (int i = 0; i < sub_conv_num; ++i) { - arg->dw_conv_args.push_back(std::make_shared()); - - arg->dw_conv_args[i]->sub_conv_num = sub_conv_num; - arg->dw_conv_args[i]->relu_enabled = relu_enabled; - // arg->dw_conv_args[i]->output.activation.activation_type = - // activation_enable; - // arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope = - // leaky_relu_negative_slope; - arg->dw_conv_args[i]->bias_address = bias_ptr; - - arg->dw_conv_args[i]->filter_address = - fpga_malloc(filter_offset * sizeof(int16_t)); - memcpy(arg->dw_conv_args[i]->filter_address, - (reinterpret_cast(filter_ptr) + i * filter_offset), - filter_offset * sizeof(int16_t)); - arg->vector_dw_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->dw_conv_args[i]->filter_address), - deleter)); - - arg->dw_conv_args[i]->kernel.height = (uint32_t)sub_filter_width; - arg->dw_conv_args[i]->kernel.width = (uint32_t)sub_filter_width; - - arg->dw_conv_args[i]->kernel.stride_h = (uint32_t)1; - arg->dw_conv_args[i]->kernel.stride_w = (uint32_t)1; - arg->dw_conv_args[i]->image.address = input_ptr; - arg->dw_conv_args[i]->image.channels = (uint32_t)input->dims()[1]; - arg->dw_conv_args[i]->image.height = (uint32_t)input->dims()[2]; - arg->dw_conv_args[i]->image.width = (uint32_t)input->dims()[3]; - - arg->dw_conv_args[i]->image.pad_height = sub_pad; - arg->dw_conv_args[i]->image.pad_width = sub_pad; - arg->dw_conv_args[i]->image.scale_address = input->scale; - - arg->dw_conv_args[i]->output.address = - fpga_malloc(sub_output_height * - align_to_x(sub_output_width * sub_channels * sub_conv_num, - IMAGE_ALIGNMENT) * - sizeof(int8_t)); - arg->dw_conv_args[i]->output.scale_address = - static_cast(fpga_malloc(2 * sizeof(float))); - arg->vector_dw_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->dw_conv_args[i]->output.address), - deleter)); - arg->vector_dw_conv_space.push_back(std::shared_ptr( - reinterpret_cast(arg->dw_conv_args[i]->output.scale_address), - deleter)); - } - - 
// arg->output.scale_address = out->scale; -} // end dwconv arg fill - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/api.h b/mobile/src/fpga/V2/api.h deleted file mode 100644 index d8674c4401..0000000000 --- a/mobile/src/fpga/V2/api.h +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "fpga/common/fpga_common.h" -#include "fpga/common/pe.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace fpga { - -void format_image(framework::Tensor* image_tensor); -void format_ofm(framework::Tensor* ofm_tensor); -void format_int8_ofm(framework::Tensor* ofm_tensor); -void format_int8_ofm(framework::Tensor* ofm_tensor, framework::DDim dims); -void format_fp32_ofm(framework::Tensor* ofm_tensor); - -float filter_find_max(framework::Tensor* filter_tensor); -int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); -int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor, - int group_num, int stride); - -int get_plit_num(framework::Tensor* filter_tensor); -int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride); - -int get_aligned_filter_element_num(int chw); -void format_filter(framework::Tensor* filter_tensor, float max_value, - int group_num); -void format_fc_filter(framework::Tensor* filter_tensor, float max_value); -void format_bias_scale_array(float** bias_scale_array, 
- int element_num_per_division, int num); -void format_bias_array(float** bias_array, int num); -void format_concat_output(framework::Tensor* out, int height, int width, - int image_num, uint32_t* channel_num); - -void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - bool relu_enabled, int group_num, int stride_h, - int stride_w, int padding_h, int padding_w, float* bs_ptr); -void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - bool relu_enabled, int group_num, int stride_h, - int stride_w, int padding_h, int padding_w, float* bs_ptr); -void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - bool relu_enabled, int stride_h, int stride_w, - int padding_h, int padding_w, float* bias_ptr); -void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - bool relu_enabled, int stride_h, int stride_w, - int padding_h, int padding_w, float* bs_ptr); - -void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, - int group_num, int stride); -void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr); -void format_conv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float** bs_ptr, int group); -void format_deconv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float** bs_ptr, - int group, int sub_conv_n); -void format_dwconv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float* scale_ptr, - float** bias_ptr); -void format_DWDeconv_data(framework::Tensor* filter_tensor, - framework::Tensor* ofm_tensor, float** bs_ptr, - int group, int sub_conv_n); - -template -void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) { - float data; - std::ofstream out(filename.c_str()); 
- for (int i = 0; i < dataSize; ++i) { - data = (((Dtype*)buffer)[i]); // NOLINT - out << data << std::endl; - } - out.close(); - return; -} - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/bias_scale.cpp b/mobile/src/fpga/V2/bias_scale.cpp deleted file mode 100644 index 44722ef59a..0000000000 --- a/mobile/src/fpga/V2/bias_scale.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/V2/bias_scale.h" -#include -#include -#include "fpga/common/fpga_common.h" - -namespace paddle_mobile { -namespace fpga { -namespace bias_scale { - -void align_element(float **data_in, int num_per_div_before_alignment, int num) { - int copynum = 0; - float *ptr_unaligned = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); - int num_element = - 2 * div_num * num_per_div_after_alignment; // including bias & scale - float *ptr_aligned = - (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT - - memset(ptr_aligned, 0, num_element * sizeof(float)); - - for (int i = 0; i < div_num; i++) { - if (i == div_num - 1) { - copynum = (num_per_div_after_alignment * div_num > num) - ? 
(num % num_per_div_after_alignment) - : (num_per_div_before_alignment); - } else { - copynum = num_per_div_before_alignment; - } - - memcpy(ptr_aligned + i * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i, - copynum * sizeof(float)); - memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i + num, - copynum * sizeof(float)); - } - - fpga_free(ptr_unaligned); - *data_in = ptr_aligned; -} - -void fixed_scale_bias_new(void *data_in, int data_len) { - int *data_tmp = static_cast(data_in); - for (int idx = 0; idx < data_len / 2; ++idx) { - float tmp = (static_cast(data_in))[idx]; - data_tmp[idx] = static_cast(round(tmp * pow(2.0, 23.0))); - tmp = (static_cast(data_in))[idx + data_len / 2]; - data_tmp[idx + data_len / 2] = - static_cast(round(tmp * pow(2.0, 30.0))); - } - return; -} - -void interleave(float **data_in, int num_after_alignment) { - // num_after_alignment: number of bias after alignment - - float *ptr_uninterleaved = *data_in; - // fixed_scale_bias_new(ptr_uninterleaved, 2 * num_after_alignment); - float *ptr_interleaved = - (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT - int num = num_after_alignment / 4; - for (int i = 0; i < num; i++) { - memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, - 4 * sizeof(float)); - memcpy(ptr_interleaved + 8 * i + 4, - ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float)); - } - - fpga_free(ptr_uninterleaved); - *data_in = ptr_interleaved; -} - -void format_bias_scale_array(float **bias_scale_array, - int element_num_per_division, int num) { - align_element(bias_scale_array, element_num_per_division, num); - int div_num = (num + element_num_per_division - 1) / element_num_per_division; - int element_num_after_division = - align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); - interleave(bias_scale_array, div_num * element_num_after_division); - fpga_flush(*bias_scale_array, 2 * 
element_num_after_division * sizeof(float)); -} -void format_bias_array(float **bias_array, int num) { - float *ptr_unaligned = *bias_array; - int num_before_align = num; - int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT); - int16_t *ptr_aligned = - (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT - - memset(ptr_aligned, 0, num_after_align * sizeof(int16_t)); - for (int i = 0; i < num_before_align; i++) { - ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]); - } - *bias_array = (float *)ptr_aligned; // NOLINT - fpga_free(ptr_unaligned); -} - -} // namespace bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/bias_scale.h b/mobile/src/fpga/V2/bias_scale.h deleted file mode 100644 index 9ebdc71bce..0000000000 --- a/mobile/src/fpga/V2/bias_scale.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle_mobile { -namespace fpga { -namespace bias_scale { - -void align_element(float** data_in, int num_per_div_before_alignment, int num); -void interleave(float** data_in, int num_after_alignment); -void format_bias_scale_array(float** bias_scale_array, - int element_num_per_division, int num); -void format_bias_array(float** bias_array, int num); - -} // namespace bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/deconv_bias_scale.cpp b/mobile/src/fpga/V2/deconv_bias_scale.cpp deleted file mode 100644 index f88e1a7738..0000000000 --- a/mobile/src/fpga/V2/deconv_bias_scale.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "fpga/V2/deconv_bias_scale.h" -// #include "deconv_bias_scale.h" -#include "fpga/V2/bias_scale.h" -// #include "bias_scale.h" -// #include - -#include "fpga/V2/api.h" -// #include "fpga_api.h" -namespace paddle_mobile { -namespace fpga { -namespace deconv_bias_scale { - -void deconv_bias_scale_expand(float** bias_scale_array, int num, - int sub_conv_n) { - int sub_num = num * sub_conv_n; - float* ptr_tmp = *bias_scale_array; - float* ptr_bias_scale_expand = - reinterpret_cast(fpga_malloc(sizeof(float) * sub_num * 2)); - int scale_base_offset = sub_num; - for (int i = 0; i < sub_conv_n; ++i) { - int offset = num * i; - // copy bias - fpga_copy(ptr_bias_scale_expand + offset, ptr_tmp, num * sizeof(float)); - // copy scale - fpga_copy(ptr_bias_scale_expand + scale_base_offset + offset, ptr_tmp + num, - num * sizeof(float)); - } - *bias_scale_array = ptr_bias_scale_expand; - fpga_free(ptr_tmp); -} - -} // namespace deconv_bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/deconv_bias_scale.h b/mobile/src/fpga/V2/deconv_bias_scale.h deleted file mode 100644 index 820c6984d4..0000000000 --- a/mobile/src/fpga/V2/deconv_bias_scale.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle_mobile { -namespace fpga { -namespace deconv_bias_scale { - -void deconv_bias_scale_expand(float** bias_scale_array, int num, - int sub_conv_n); - -} // namespace deconv_bias_scale -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/deconv_filter.cpp b/mobile/src/fpga/V2/deconv_filter.cpp deleted file mode 100644 index 5ed9786f19..0000000000 --- a/mobile/src/fpga/V2/deconv_filter.cpp +++ /dev/null @@ -1,280 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "fpga/V2/deconv_filter.h" -#include -#include -// #include "deconv_filter.h" -#include "fpga/V2/filter.h" -// #include "filter.h" -#include "fpga/V2/api.h" - -namespace paddle_mobile { -namespace fpga { -namespace deconv_filter { - -/* -inverse kernel weights of each channel for every filter -*/ -void deconv_inverse_filter(float** data_in, int num, int channel, int width, - int height) { - float* tmp = *data_in; - int data_size = num * channel * width * height; - int hw_len = height * width; - auto tmp_data = - reinterpret_cast(fpga_malloc(data_size * sizeof(float))); - for (int i = 0; i < num; ++i) { - for (int j = 0; j < channel; ++j) { - for (int k = 0; k < hw_len; ++k) { - tmp_data[i * channel * hw_len + j * hw_len + k] = - (*data_in)[i * channel * hw_len + j * hw_len + hw_len - k - 1]; - } - } - } - *data_in = tmp_data; - fpga_free(tmp); -} - -/* - calculate sub padding number -*/ -int deconv_calc_sub_pad(int filter_axis, int pad, int stride) { - if (stride == 0 || ((filter_axis - pad - 1) < 0)) { - PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters"); - } - return (filter_axis - pad - 1) / stride; -} -int deconv_get_sub_filter_axis(int filter_axis, int stride) { - return (filter_axis / stride); -} - -int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) { - return ((image_axis + 2 * sub_pad - sub_filter_axis) + 1); -} - -/* - (filter_width-pad,filter_width-pad) is the first pixel of sub-pixel image - position. 
so the omit rows or columns is (stride - ) -*/ -int deconv_get_omit(int stride, int filter_width, int pad) { - PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters"); - int idx; - bool flag = false; - for (idx = 1; idx <= stride; ++idx) { - int j = idx; - for (; j <= filter_width;) { - if (j == filter_width - pad) { - flag = true; - break; - } - j = j + stride; - } - if (flag) { - break; - } - } - - return (stride - idx); -} - -template -void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n, - int kernel_num, int channel) { - T* ptr_tmp = *data_in; - int sub_num = kernel_num * sub_conv_n; - int sub_h = height / sub_conv_n; - int sub_w = width / sub_conv_n; - - int sub_filter_size = - kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n; - - T* ptr_sub_filter = - reinterpret_cast(fpga_malloc(sub_filter_size * sizeof(T))); - for (int idx = 0; idx < sub_conv_n; ++idx) { - for (int nn = 0; nn < sub_num; ++nn) { - int ni = nn % kernel_num; - - int woff = sub_conv_n - 1 - (nn / kernel_num); // - - for (int hh = 0; hh < sub_h; ++hh) { - int hi = hh * sub_conv_n + idx % sub_conv_n; - for (int ww = 0; ww < sub_w; ++ww) { - int wi = ww * sub_conv_n + woff; // 1 0 - - int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel; // - int kidx = ((ni * height + hi) * width + wi) * channel; // - - fpga_copy( - ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx, - (*data_in) + kidx, channel * sizeof(T)); - // for (int cc =0; cc < channel; ++cc) { - // ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] = - // (*data_in)[kidx + cc]; - // } - } - } - } - } - *data_in = ptr_sub_filter; - fpga_free(ptr_tmp); -} - -void deconv_NC_convert(float** filter_in, int kernel_num, int channels, - int hw) { - float* tmp = *filter_in; - float* ptr_filter = reinterpret_cast(paddle_mobile::fpga::fpga_malloc( - hw * kernel_num * channels * sizeof(float))); - - for (int c = 0; c < channels; ++c) { - for (int n = 0; n < kernel_num; ++n) { 
- paddle_mobile::fpga::fpga_copy(ptr_filter + n * hw + kernel_num * hw * c, - tmp + n * channels * hw + c * hw, - hw * sizeof(float)); - } - } - *filter_in = ptr_filter; - paddle_mobile::fpga::fpga_free(tmp); -} - -void deconv_format_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max, int stride) { - int data_size = channel * height * width * num; - - /*{ - float result2 = (float)0; - string filename = "origin_filter_data"; - api::savefile(filename, (void *)*data_in, data_size, result2); - }*/ - - deconv_inverse_filter(data_in, num, channel, width, height); - - /* { - float result2 = (float)0; - string filename = "inverse_filter_data"; - api::savefile(filename, (void *)*data_in, data_size, result2); - }*/ - - filter::quantize(data_in, data_size, max); - /* { - char result2 = (char)0; - string filename = "quantize_filter_data"; - api::savefile(filename, (void *)*data_in, data_size, result2); - }*/ - char** quantize_data = (char**)data_in; // NOLINT - - filter::convert_to_hwc(quantize_data, num, channel, height, width); - /*{ - char result2 = (char)0; - string filename = "convert_to_hwc_filter_data"; - api::savefile(filename, (void *)*quantize_data, data_size, - result2); - }*/ - - deconv_get_sub_filter(quantize_data, height, width, stride, num, - channel); - /*{ - char result2 = (char)0; - string filename = "sub_filter_filter_data"; - api::savefile(filename, (void *)*quantize_data, data_size, result2); -}*/ - - int sub_conv_n = stride; - int sub_h = height / sub_conv_n; - int sub_w = width / sub_conv_n; - int sub_chw = sub_h * sub_w * channel; - int sub_num = sub_conv_n * num; - int division_capacity = filter::calc_division_capacity(sub_chw); - int num_per_div_before_alignment = - filter::calc_num_per_div(sub_num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = (sub_num + num_per_div_before_alignment - 1) / - 
num_per_div_before_alignment; - int residual = (sub_num) % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - - char** ptr_ptr_data = - reinterpret_cast(fpga_malloc(sub_conv_n * sizeof(char*))); - int origin_offset = sub_chw * sub_num; - for (int i = 0; i < sub_conv_n; ++i) { - (ptr_ptr_data)[i] = - reinterpret_cast(fpga_malloc(origin_offset * sizeof(char))); - fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i, - origin_offset * sizeof(char)); - - /* char result2 = (char)0; - string filename = "ptr_ptr_data" + to_string(i); - api::savefile(filename, (void *)(ptr_ptr_data[i]), origin_offset, - result2); - */ - } - // char result2 = (char)0; - // string filename = "interleave"; - // api::savefile(filename, (void *)*ptr_ptr_data, origin_offset, - // result2); - fpga_free(*quantize_data); - - int align_offset = - align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment; - char* ptr_space = reinterpret_cast(fpga_malloc( - sub_conv_n * align_offset * sizeof(char))); // continuous space - for (int i = 0; i < sub_conv_n; ++i) { - char* ptr_tmp = (ptr_ptr_data)[i]; - - filter::align_element(&ptr_tmp, sub_num, sub_chw); - filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw); - - filter::reorder(&ptr_tmp, num_after_alignment, sub_chw); - filter::interleave(&ptr_tmp, num_after_alignment, sub_chw); - - /* char result2 = (char)0; - string filename = "interleave" + to_string(i); - api::savefile(filename, (void *)ptr_tmp, align_offset, result2); -*/ - fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset); - fpga_free(ptr_tmp); - } - fpga_free(ptr_ptr_data); - *data_in = reinterpret_cast(ptr_space); - - /* { - char result2 = (char)0; - string filename = "ptr_space"; - api::savefile(filename, (void *)ptr_space, sub_conv_n * - align_offset, result2); - }*/ - fpga_flush(ptr_space, sub_conv_n * 
align_offset * sizeof(char)); -} - -void DWDconv_format_filter(float** data_in, int num, int channel, int height, - int width, float* scale_ptr, int stride) { - deconv_inverse_filter(data_in, num, channel, width, height); - - filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr); - int16_t** quantize_data = (int16_t**)data_in; // NOLINT - filter::convert_to_hwn(quantize_data, channel, height, width); - - deconv_get_sub_filter(quantize_data, height, width, stride, num, - channel); - - filter::align_element_n(quantize_data, channel, height, width); - fpga_flush(*quantize_data, align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) * - height * width * sizeof(int16_t)); -} - -} // namespace deconv_filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/deconv_filter.h b/mobile/src/fpga/V2/deconv_filter.h deleted file mode 100644 index f1a50b95c5..0000000000 --- a/mobile/src/fpga/V2/deconv_filter.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle_mobile { -namespace fpga { -namespace deconv_filter { - -void deconv_inverse_filter(float** data_in, int num, int channel, int width, - int height); -int deconv_calc_sub_pad(int filter_axis, int pad, int stride); -int deconv_get_sub_filter_axis(int filter_axis, int stride); -int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); -int deconv_get_omit(int stride, int filter_width, int pad); - -template -void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n, - int kernel_num, int channel); -void deconv_format_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max, int stride); -void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw); -void DWDconv_format_filter(float** data_in, int num, int channel, int height, - int width, float* scale_ptr, int stride); - -} // namespace deconv_filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/filter.cpp b/mobile/src/fpga/V2/filter.cpp deleted file mode 100644 index a281a7335c..0000000000 --- a/mobile/src/fpga/V2/filter.cpp +++ /dev/null @@ -1,362 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "fpga/V2/filter.h" -#include -#include -#include "fpga/common/fpga_common.h" - -namespace paddle_mobile { -namespace fpga { -namespace filter { - -int calc_division_capacity(int chw) { - int n = 2048 / ((chw + 15) / 16) * 32; - return n < 2048 ? n : 2048; -} - -int calc_split_num(int num, int division_capacity) { - return (num + division_capacity - 1) / division_capacity; -} - -int calc_division_number(int num, int group_num, int division_capacity) { - // PADDLE_MOBILE_ENFORCE(num % group_num == 0, - // "Filter number should be divisible by group - // number"); - int split_num = calc_split_num(num, division_capacity); - // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - // "Split number or group number should be 1"); - return group_num * split_num; -} - -int calc_num_per_div(int num, int group_num, int division_capacity) { - // PADDLE_MOBILE_ENFORCE(num % group_num == 0, - // "Filter number should be divisible by group - // number"); - int split_num = calc_split_num(num, division_capacity); - // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - // "Split number or group number should be 1"); - if (group_num == 1) { - if (num > division_capacity) { - return division_capacity; - } else { - return num; - } - } else { - return (num + group_num - 1) / group_num; - } -} - -void convert_to_hwc(char **data_in, int num, int channel, int height, - int width) { - char *tmp = *data_in; - int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * chw + offset_height + w * channel + c) = - *((*data_in)++); - } - } - } - } - - *data_in = data_tmp; - fpga_free(tmp); -} - -float find_max(float *data_in, int data_size) { - float max = 0.0; - for 
(int i = 0; i < data_size; ++i) { - float value = data_in[i]; - float abs = value > 0 ? value : -value; - max = std::max(max, abs); - } - return max; -} - -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} - -void quantize(float **data_in, int data_size, float max) { - float *tmp = *data_in; - float fix_range = 127; - float scale = fix_range / max; - - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8( - (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} - -void align_element(char **data_in, int num, int chw) { - int i = 0; - int j = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - if (align_chw != chw) { - char *tmp = *data_in; - char *data_tmp = - (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num * align_chw); - for (j = 0; j < num; j++) { - memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); - } - *data_in = data_tmp; - fpga_free(tmp); - } -} - -void align_num(char **data_in, int num_per_div_before_alignment, int num, - int chw) { - int i = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(char)); - - for (i = 0; i < div_num - 1; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - num_per_div_before_alignment * align_chw); - } - - 
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); - - *data_in = data_tmp; - fpga_free(tmp); -} - -void reorder(char **data_in, int num_after_alignment, int chw) { - int index = 0; - int new_index; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - for (index = 0; index < num_after_alignment; index++) { - new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + - (index / 16 % 2 * 4); - memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align, - chw_align); - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void interleave(char **data_in, int num_after_alignment, int chw) { - int i = 0; - int j = 0; - int k = 0; - int interleave_per_num = 16; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - int interleave_num = chw_align * 2 / interleave_per_num; - for (i = 0; i < num_after_alignment; i += 2) { - for (j = 0, k = 0; j < interleave_num; j += 2, k++) { - memcpy(data_tmp + i * chw_align + interleave_per_num * j, - *data_in + i * chw_align + interleave_per_num * k, - interleave_per_num); - memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), - *data_in + (i + 1) * chw_align + interleave_per_num * k, - interleave_per_num); - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void format_filter(float **data_in, int num, int channel, int height, int width, - int group_num, float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int num_per_div_before_alignment = - calc_num_per_div(num, group_num, 
division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - - reorder(quantize_data, num_after_alignment, chw); - interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * - num_after_alignment * sizeof(char)); -} - -void convert_fc_filter(char **data_in, int num, int chw) { - char *tmp = *data_in; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - for (int c = 0; c < chw; c++) { - data_tmp[n * chw + c] = (*data_in)[num * c + n]; - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void format_fc_filter(float **data_in, int num, int channel, int height, - int width, int group_num, float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int num_per_div_before_alignment = - calc_num_per_div(num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? 
div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_fc_filter(quantize_data, num, chw); - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - reorder(quantize_data, num_after_alignment, chw); - interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * - num_after_alignment * sizeof(char)); -} -void convert_to_hwn(int16_t **data_in, int num, int height, int width) { - int16_t *tmp = *data_in; - int16_t *data_tmp = - (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - *(data_tmp + h * width * num + w * num + n) = *((*data_in)++); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -void align_element_n(int16_t **data_in, int num, int height, int width) { - int unalign_n = num; - int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); - if (unalign_n == align_n) { - return; - } else { - int16_t *tmp = *data_in; - - int num_element = height * width * align_n; - int16_t *data_tmp = - (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(int16_t)); - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int offset_unalign = h * width * unalign_n + w * unalign_n; - int offset_align = h * width * align_n + w * align_n; - for (int n = 0; n < unalign_n; n++) { - data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n); - } - } - } - - *data_in = data_tmp; - fpga_free(tmp); - } -} -void quantize_to_fp16(float **data_in, int num, int height, int width, - float *scale_ptr) { - float *tmp = *data_in; - int size = num * 
height * width; - - int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t)); // NOLINT - for (int n = 0; n < num; n++) { - float scale_val = scale_ptr[n]; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int index = n * height * width + h * width + w; - tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val); - } - } - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} -void format_dwconv_filter(float **data_in, int num, int height, int width, - float *scale_ptr) { - quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT - convert_to_hwn(quantize_data, num, height, width); - align_element_n(quantize_data, num, height, width); - fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * - height * width * sizeof(int16_t)); -} - -void format_DWDeconv_filter(float **data_in, int num, int height, int width, - float *scale_ptr) { - quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT - convert_to_hwn(quantize_data, num, height, width); - align_element_n(quantize_data, num, height, width); - fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * - height * width * sizeof(int16_t)); -} -} // namespace filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/filter.h b/mobile/src/fpga/V2/filter.h deleted file mode 100644 index 4812a75af2..0000000000 --- a/mobile/src/fpga/V2/filter.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -namespace paddle_mobile { -namespace fpga { -namespace filter { - -int calc_division_capacity(int chw); -int calc_split_num(int num, int division_capacity); -int calc_division_number(int num, int group_num, int division_capacity); -int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc(char** data_in, int num, int channel, int height, - int width); -float find_max(float* data_in, int data_size); -void quantize(float** data_in, int data_size, float max); -void align_element(char** data_in, int num, int chw); -void align_num(char** data_in, int num_per_div_before_alignment, int num, - int chw); -void reorder(char** data_in, int num_after_alignment, int chw); -void interleave(char** data_in, int num_after_alignment, int chw); -void format_filter(float** data_in, int num, int channel, int height, int width, - int group_num, float max); - -void convert_fc_filter(char** data_in, int num, int chw); -void format_fc_filter(float** data_in, int num, int channel, int height, - int width, int group_num, float max); - -void convert_to_hwn(int16_t** data_in, int num, int height, int width); -void align_element_n(int16_t** data_in, int num, int height, int width); -void quantize_to_fp16(float** data_in, int num, int height, int width, - float* scale_ptr); -void format_dwconv_filter(float** data_in, int num, int height, int width, - float* scale_ptr); - -} // namespace filter -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp deleted file mode 100644 
index eda7837bd0..0000000000 --- a/mobile/src/fpga/V2/image.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/V2/image.h" - -namespace paddle_mobile { -namespace fpga { -namespace image { - -void convert_to_hwc(float **data_in, int channel, int height, int width, - int num) { - float *data_tmp = reinterpret_cast( - fpga_malloc(num * channel * height * width * sizeof(float))); - int64_t amount_per_row = width * channel; - for (int n = 0; n < num; n++) { - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * channel * height * width + offset_height + - w * channel + c) = *((*data_in)++); - } - } - } - } - *data_in = data_tmp; -} - -void convert_to_chw(float **data_in, int channel, int height, int width, - int num) { - float *data_tmp = - (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT - int64_t amount_per_side = width * height; - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + n * height * width * channel + c * amount_per_side + - width * h + w) = *((*data_in)++); - } - } - } - } - *data_in = data_tmp; -} - -void concat_images(int8_t **images_in, float **scales_in, void *image_out, - float *scale_out, int image_num, 
uint32_t *channel_num, - int height, int width) { - int i = 0; - int j = 0; - int k = 0; - int each_out_line_channel = 0; - int align_each_out_area_cw = 0; - int align_each_in_area_cw = 0; - int align_each_out_area_cw_differ = 0; - int tmp_channel = 0; - float Ck = 0.0f; - float So = scale_out[0]; - auto images_in_tmp = - (int8_t **)fpga::fpga_malloc(image_num * sizeof(int8_t *)); // NOLINT - for (i = 0; i < image_num; i++) { - images_in_tmp[i] = reinterpret_cast(fpga::fpga_malloc( - height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * - sizeof(int8_t))); - } - for (i = 0; i < image_num; i++) { - each_out_line_channel += channel_num[i]; - float Si_k = scales_in[i][0]; - Ck = Si_k / So; - fpga_invalidate(images_in[i], - height * - align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * - sizeof(int8_t)); - } - align_each_out_area_cw = - align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); - align_each_out_area_cw_differ = - align_each_out_area_cw - each_out_line_channel * width; - - for (k = 0; k < height; k++) { - for (j = 0; j < width; j++) { - for (i = 0; i < image_num; i++) { - align_each_in_area_cw = - align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - memcpy((int8_t *)image_out + tmp_channel + // NOLINT - k * align_each_out_area_cw_differ, - images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, - channel_num[i] * sizeof(int8_t)); - - tmp_channel += channel_num[i]; - } - } - } - fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t)); - for (i = 0; i < image_num; i++) { - fpga_free(images_in_tmp[i]); - } - fpga_free(images_in_tmp); -} - -void split_image(int8_t *image_in, void **images_out, int image_num, - const uint32_t *channel_nums, int height, int width) { - int total_channel = 0; - for (int i = 0; i < image_num; i++) { - total_channel += channel_nums[i]; - } - int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT); - fpga_invalidate(image_in, element_num * sizeof(int8_t)); - int src_offset 
= 0, des_offset = 0; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) + - w * total_channel; - for (int i = 0; i < image_num; i++) { - des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + - w * channel_nums[i]; - memcpy(reinterpret_cast(images_out[i]) + des_offset, - image_in + src_offset, channel_nums[i] * sizeof(int8_t)); - src_offset += channel_nums[i]; - } - } - } - - for (int i = 0; i < image_num; i++) { - element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT); - fpga_flush(images_out[i], element_num * sizeof(int8_t)); - } -} - -} // namespace image -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/image.h b/mobile/src/fpga/V2/image.h deleted file mode 100644 index 11988ee11d..0000000000 --- a/mobile/src/fpga/V2/image.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include "fpga/common/fpga_common.h" -namespace paddle_mobile { -namespace fpga { -namespace image { - -void convert_to_hwc(float** data_in, int channel, int height, int width, - int num = 1); -void convert_to_chw(float** data_in, int channel, int height, int width, - int num = 1); -template -void align_element_conv(Dtype** data_in, int height, int cw); -template -void align_element_conv(Dtype** data_in, int height, int cw) { - int h = 0; - int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - - Dtype* data_tmp = - (Dtype*)fpga_malloc(height * align_cw * sizeof(Dtype)); // NOLINT - - memset(data_tmp, 0, height * align_cw * sizeof(Dtype)); - - for (h = 0; h < height; h++) { - memcpy((void*)(data_tmp + h * align_cw), // NOLINT - (void*)(*data_in + h * cw), // NOLINT - cw * sizeof(Dtype)); - } - - *data_in = data_tmp; -} -template -void format_image(T** data_in, int channel, int height, int width) { - int cw = channel * width; - int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - if (align_cw != cw) { - T* hwc_temp = *data_in; - align_element_conv(data_in, height, channel * width); - fpga_free(hwc_temp); - } - fpga_flush(*data_in, - align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T)); -} -// Concat featuremaps along channel direction -void concat_images(int8_t** images_in, float** scales_in, void* image_out, - float* scale_out, int image_num, uint32_t* channel_num, - int height, int width); - -// Split featuremap along channel direction -void split_image(int8_t* image_in, void** images_out, int image_num, - const uint32_t* channel_nums, int height, int width); -} // namespace image -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp deleted file mode 100644 index 585ab6706e..0000000000 --- a/mobile/src/fpga/V2/pe.cpp +++ /dev/null @@ -1,1138 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/common/pe.h" -#include "common/enforce.h" -#include "common/types.h" -#include "fpga/V2/filter.h" -#include "fpga/V2/image.h" -#include "fpga/common/config.h" -#include "fpga/common/driver.h" -#include "fpga/common/fpga_common.h" -#ifdef COST_TIME_PRINT -#include -#include -#include -#include -#endif - -namespace paddle_mobile { -namespace fpga { - -using namespace driver; // NOLINT -using namespace std; // NOLINT -#define USE_RELU 1 -#define USE_BIAS 2 - -// bypass cmd -#define CMD_FP16_TO_FP16 0 -#define CMD_FP16_TO_FP32 1 -#define CMD_FP32_TO_FP16 2 -#define CMD_FP32_TO_FP32 3 -#define CMD_INT8_TO_FP16 4 - -// bypass macro -#define SIZE_FP16 2 -#define SIZE_FP32 4 -#define SIZE_INT8 1 - -#define PE_IRQ_TIMEOUT 1000000 - -/* Interrupt bit-set offset*/ -#define INTERRUPT_RSVD 0x0001 -#define INTERRUPT_BYPASS 0x0002 -#define INTERRUPT_CONV 0x0004 -#define INTERRUPT_POOLING 0x0008 -#define INTERRUPT_EW 0x0010 - -/* Register offset */ -#define REG_INTERRUPT 0x000 -#define REG_VERSION 0x008 -#define REG_TEMPERATURE 0x010 -#define REG_FPGA_RESET 0x018 -#define REG_TEST_REGISTER 0x048 -#define REG_HARDWARE_STATUS 0x050 - -#define REG_TIMER_COUNTER 0x070 - -#define REG_SCALE_PARAMETER 0x080 -#define REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR 0x090 - -#define REG_FLASH_CMD 0x200 -#define REG_FLASH_DATA 0x208 -#define REG_FLASH_CONFIG 0x210 -#define REG_FLASH_STATUS 0x218 -#define REG_SN 0x220 - -/*bypass*/ -#define 
REG_CONVERT_CMD 0x400 -#define REG_CONVERT_SRC_ADDR 0x408 -#define REG_CONVERT_DST_ADDR 0x410 -#define REG_CONVERT_RD_LENGTH 0x418 -#define REG_CONVERT_WR_LENGTH 0x420 - -/*resize*/ -#define REG_RESIZE_CMD 0x600 -#define REG_RESIZE_CHANNEL_NUMBER 0x608 -#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610 -#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618 -#define REG_RESIZE_INPUT_BASE_ADDR 0x620 -#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628 -#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630 -#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638 - -/*pooling*/ -#define REG_POOLING_CMD 0x800 -#define REG_POOLING_IMAGE_BASE_ADDR 0x808 -#define REG_POOLING_RESULT_BASE_ADDR 0x810 -#define REG_POOLING_IMAGE_PIXEL 0x818 -#define REG_POOLING_WINDOW_SIZE 0x820 -#define REG_POOLING_RESULT_PIXEL 0x828 -#define REG_POOLING_PAD_PIXEL 0x830 -#define REG_POOLING_STEP_PIXEL 0x838 -#define REG_POOLING_CHANNEL_NUMBER 0x840 -#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848 -#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850 -#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858 -#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860 -#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 -#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_16 0x880 -#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 -#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 -#define REG_POOLING_MODE_RECIPROCAL 0x890 - -/*conv*/ -#define REG_CONV_CMD 0xC00 -#define REG_CONV_REG0 0xC08 -#define REG_CONV_REG1 0xC10 -#define REG_CONV_REG2 0xC18 -#define REG_CONV_REG3 0xC20 -#define REG_CONV_REG4 0xC28 -#define REG_CONV_REG5 0xC30 -#define REG_CONV_REG6 0xC38 -#define REG_CONV_REG7 0xC40 -#define REG_CONV_REG8 0xC48 -#define REG_CONV_REG9 0xC50 -#define REG_CONV_REG10 0xC58 -#define REG_CONV_REG11 0xC60 - -#define REG_CONV_IMAGE_BASE_ADDR 0xC08 -#define REG_CONV_FILTER_BASE_ADDR 0xC10 -#define REG_CONV_SB_BASE_ADDR 0xC18 -#define REG_CONV_RESULT_BASE_ADDR 0xC20 -#define 
REG_CONV_IMAGE_PIXEL 0xC28 -#define REG_CONV_FILTER_PIXEL 0xC30 -#define REG_CONV_RESULT_PIXEL 0xC38 -#define REG_CONV_PAD_PIXEL 0xC40 -#define REG_CONV_STEP_PIXEL 0xC48 -#define REG_CONV_GROUP_NUMBER 0xC50 -#define REG_CONV_FILTER_NUMBER 0xC58 -#define REG_CONV_CHANNEL_NUMBER 0xC60 -#define REG_CONV_FILTER_PER_GROUP 0xC68 -#define REG_CONV_CHANNEL_PER_GROUP 0xC70 -#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78 -#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80 -#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88 -#define REG_CONV_FILTER_AMOUNT_ALL 0xC90 -#define REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98 -#define REG_CONV_RESULT_LAST_VALID 0xCA0 - -#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8 -#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0 -#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8 -#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0 -#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8 -#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0 -#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8 -#define REG_CONV_IMAGE_WIN_CNT 0xCE0 -#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8 -#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8 -#define REG_CONV_PROG_FULL_CNT 0xD08 -#define REG_CONV_POST_PROG_FULL_CNT 0xD10 -#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20 - -#define REG_CONV_IMAGE_SCALE 0xD28 -#define REG_CONV_FILTER_SCALE 0xD30 - -/*ew*/ -#define REG_EW_CMD 0x0F00 -#define REG_EW_IMAGE0_BASE_ADDR 0x0F08 -#define REG_EW_IMAGE1_BASE_ADDR 0x0F10 -#define REG_EW_RESULT_BASE_ADDR 0x0F18 -#define REG_EW_DATA_LEN 0x0F20 -#define REG_EW_COEFFICIENT 0x0F28 -#define REG_EW_IMAGE_PIXEL 0x0F30 -#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38 - -/*dwconv*/ -#define REG_DWCONV_FILTER_BASE_ADDR 0xe08 -#define REG_DWCONV_FILTER_SHAPE 0xe10 -#define REG_DWCONV_FILTER_N_ALIGN 0xe18 -#define REG_DWCONV_FILTER_SUBNUMBER 0xe20 -#define REG_DWCONV_CMD 0xe00 - -int ComputeFpgaConv(const struct SplitConvArgs &args) { -// ComputeBasicConv(args.conv_arg[0]); -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFPGAConv==========="; - DLOG << " 
filter_num:" << args.filter_num - << " group_num:" << args.group_num - << " split_num:" << args.split_num; -#endif - int ret = 0; - int split_num = args.split_num; - for (int i = 0; i < split_num; i++) { - ret |= ComputeBasicConv(args.conv_arg[i]); - } - - if (split_num > 1) { - ComputeFPGAConcat(args.concat_arg); - } - - return ret; -} - -int ComputeBasicConv(const struct ConvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "======Compute Basic Conv======"; - DLOG << " relu_enabled:" << args.relu_enabled; - DLOG << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - -#ifdef PADDLE_MOBILE_ZU5 - int ret = 0; - uint64_t output_scale = 0; - - // uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - // ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - - // active_args.activation_type = args.output.activation.activation_type; - - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - DLOG 
<< " reg_ActivationArgs:"; - uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { - ret = -EIO; - DLOG << "Conv Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - reg_writeq(output_scale, REG_SCALE_PARAMETER); - // new - reg_writeq((args.driver.row_padding_down << 45) | - (args.driver.row_padding_up << 34) | - (args.driver.col_padding_down << 17) | - args.driver.col_padding_up, - REG_CONV_REG0); - - reg_writeq((args.driver.image_win_cnt_last << 50) | - (args.driver.image_win_cnt << 39) | - (args.driver.image_block_amount_per_row << 20) | - args.driver.filter_pad_width_mul_channel, - REG_CONV_REG1); - - reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | - (args.driver.filter_row << 10) | - (args.driver.filter_height << 5) | args.driver.filter_width, - REG_CONV_REG2); - - reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | - (args.driver.prog_full_cnt << 16) | - args.driver.filter_amount_all, - REG_CONV_REG3); - - reg_writeq((args.driver.post_prog_full_cnt << 54) | - (args.driver.last_cal_res_row_num << 50) | - (args.driver.cal_res_num << 39) | - (args.driver.res_row_data_align4_pad << 35) | - (args.driver.output_amount_per_row << 16) | - args.driver.output_width, - REG_CONV_REG4); - - reg_writeq((args.driver.deconv_dump << 40) | (args.driver.deconv_ena << 39) | - (args.driver.deconv_res_skip_row << 7) | - args.driver.deconv_skip_row, - REG_CONV_REG5); - - reg_writeq((args.driver.result_amount_per_row_multi_para << 43) | - (args.driver.output_height << 32) | - args.driver.output_address_phy, - REG_CONV_REG6); - - reg_writeq((args.driver.filter_amount_whole << 48) | - (args.driver.fpga_bias_scale_len << 32) | - args.driver.sb_address_phy, - REG_CONV_REG7); - - 
reg_writeq( - (args.driver.filters_amount_whole << 32) | args.driver.filter_address_phy, - REG_CONV_REG8); - - reg_writeq((args.driver.image_amount_per_row << 43) | - (args.driver.image_hight << 32) | - args.driver.image_address_phy, - REG_CONV_REG9); - - reg_writeq((args.driver.filter_pad_hight << 46) | - (args.driver.image_amount_per_row_multi_win << 23) | - args.driver.image_amount_per_row_multi_win_first, - REG_CONV_REG10); - - reg_writeq((args.driver.image_block_num << 48) | - (args.driver.image_block_len << 24) | - args.driver.image_block_len_last, - REG_CONV_REG11); - - reg_writeq(args.driver.cmd, REG_CONV_CMD); - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; - ret = -EIO; - DLOG << "Conv Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "Conv Wait Irq Timeout"); - } - DLOG << "after reg poll"; - - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - - return ret; -#endif - return 0; -} // ComputeBasicConv - -int ComputeFpgaPool(const struct PoolingArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaPool==========="; - DLOG << " mode:" << args.mode - << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - // return 0; - uint64_t output_scale = 0; - uint64_t timer_cnt = 0; - int ret = 0; - uint64_t cmd = 0; - uint64_t 
image_physical_address = 0; - uint64_t output_physical_address = 0; - uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); - image_physical_address = vaddr_to_paddr(args.image.address); - output_physical_address = vaddr_to_paddr(args.output.address); - uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); - uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); - uint64_t output_height = (uint64_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint64_t output_width = (uint64_t)( - (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); - - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - (uint64_t)args.image.width * (uint64_t)args.image.channels + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32); - uint64_t result_addr_row = - (result_amount_align_32 << 32) | output_physical_address; - uint64_t row_padding_down = - (uint64_t)args.image.height + (uint64_t)args.image.pad_height; - uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; - uint64_t kernel_padding_step = row_padding_down | - ((uint64_t)args.image.pad_height << 16) | - ((uint64_t)args.kernel.stride_h << 24) | - ((uint64_t)kernel_width_sub1 << 32) | - ((uint64_t)args.kernel.height << 40) | - ((uint64_t)(args.kernel.height - 1) << 48); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - (output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t result_size_calcu_height = (output_height - 1) | - ((output_width - 1) << 16) | - (image_calcu_height << 32); - uint64_t col_padding_down = - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width) * - (uint64_t)args.image.channels; - - uint64_t 
image_row_col_padding_down = - image_amount_per_row | (col_padding_down << 32); - uint64_t image_rowXpadding_h = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_rowXstep_h = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t image_rowXpad_h_rowXstep_h = - image_rowXpadding_h | (image_rowXstep_h << 32); - uint64_t channelXpad_w = - (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; - uint64_t channelXstep_w = - (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; - uint64_t channelXpad_w_channelXstep_w = - channelXpad_w | (channelXstep_w << 32); - uint64_t filter_row_align = C_align_32 * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = - C_align_32 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height; - uint64_t mult_factor = 0; - float average_reciprocal = args.kernel_reciprocal; - uint32_t *kernel_reciprocal; - kernel_reciprocal = (reinterpret_cast(&average_reciprocal)); - if (args.mode == 1) - mult_factor = (uint64_t)(*kernel_reciprocal) | ((uint64_t)1 << 32) | - ((uint64_t)1 << 40); - else - mult_factor = - (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { - ret = -EIO; - DLOG << "Conv Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, 0x808); - reg_writeq(result_addr_row, 0x810); - reg_writeq(kernel_padding_step, 0x818); - reg_writeq(result_size_calcu_height, 0x820); - reg_writeq((uint64_t)args.image.channels, 0x828); - reg_writeq(image_row_col_padding_down, 0x830); - reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); - reg_writeq(mult_factor, 0x840); // dw donot care - reg_writeq(channelXpad_w_channelXstep_w, 0x848); - if (args.mode == 1) - cmd = (uint64_t)4; - else - cmd = (uint64_t)8; - reg_writeq(cmd, 0x800); - - DLOG << 
"before reg poll"; - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; - ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "Pooling Wait Irq Timeout!"); - } - DLOG << "after reg poll"; - - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - - return ret; -#endif - return 0; -} // ComputeFpgaPool - -int ComputeFpgaEWAdd(const struct EWAddArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaEWAdd==========="; - DLOG << " relu_enabled:" << args.relu_enabled; - DLOG << " const0:" << fp16_2_fp32(int16_t(args.const0)) - << " const1:" << fp16_2_fp32(int16_t(args.const1)); - DLOG << " image0_address:" << args.image0.address - << " image0_scale_address:" << args.image0.scale_address - << " image0_channels:" << args.image0.channels - << " image0_height:" << args.image0.height - << " image0_width:" << args.image0.width - << " pad0_height:" << args.image0.pad_height - << " pad0_width:" << args.image0.pad_width; - DLOG << " image1_address:" << args.image1.address - << " image1_scale_address:" << args.image1.scale_address - << " image1_channels:" << args.image1.channels - << " image1_height:" << args.image1.height - << " image1_width:" << args.image1.width - << " pad1_height:" << args.image1.pad_height - << " pad_width:" << args.image1.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - int ret = 0; - uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); - - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { - ret = -EIO; - DLOG << "EW Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - - uint64_t image0_physical_address = 0; - uint64_t image1_physical_address = 0; - uint64_t image_physical_address = 0; - uint64_t output_physical_address = 0; - 
image0_physical_address = vaddr_to_paddr(args.image0.address); - image1_physical_address = vaddr_to_paddr(args.image1.address); - image_physical_address = - image0_physical_address | (image1_physical_address << 32); - output_physical_address = vaddr_to_paddr(args.output.address); - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, - IMAGE_ALIGNMENT); - uint64_t result_addr_row = - output_physical_address | (image_amount_per_row << 32); - uint64_t kernel_padding_step = 0; - kernel_padding_step = ((uint64_t)args.image0.height * 2) | - ((uint64_t)2 << 24) | ((uint64_t)2 << 40) | - ((uint64_t)1 << 48); - uint64_t result_size_calcu_height = - ((uint64_t)args.image0.height - 1) | - ((image_amount_per_row / 32 - 1) << 16) | - (((uint64_t)args.image0.height * 2) << 32); - uint64_t image_row_col_padding_down = - image_amount_per_row | (image_amount_per_row << 32); - float quantParam = - ((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]); - uint32_t *ew_scale = reinterpret_cast(&quantParam); - uint64_t ew_scale_mult_factor = (*ew_scale) | ((uint64_t)args.const0 << 32) | - ((uint64_t)args.const1 << 40); - reg_writeq(0ul, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, 0x808); - reg_writeq(result_addr_row, 0x810); - reg_writeq(kernel_padding_step, 0x818); - reg_writeq(result_size_calcu_height, 0x820); - reg_writeq(32, 0x828); - reg_writeq(image_row_col_padding_down, 0x830); - reg_writeq(((image_amount_per_row * 2) << 32), 0x838); - reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care - reg_writeq(((uint64_t)32 << 32), 0x848); - reg_writeq(0, 0x858); - uint64_t cmd = 0; - cmd = (uint64_t)2 | (((uint64_t)args.relu_enabled) << 8); - reg_writeq(cmd, 0x800); - - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; - ret = -EIO; - DLOG << "EW Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); - } - 
- pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; -#endif - return 0; -} // ComputeFpgaEWAdd - -int PerformBypass(const struct BypassArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaBypass==========="; - DLOG << " input_type:" << args.input_data_type - << " output_type:" << args.output_data_type - << " input_layout_type:" << args.input_layout_type - << " output_layout_type:" << args.output_layout_type; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); - uint64_t output_scale = 0; - uint64_t timer_cnt = 0; - uint64_t cmd = 0; - uint64_t datalen = 0; - uint64_t input_address_phy = 0; - uint64_t output_address_phy = 0; - uint8_t data_cell_in = 0; - uint8_t data_cell_out = 0; - int ret = 0; - - uint64_t reg_ActivationArgs = 0; - ActivationArgs active_args; - active_args.activation_type = args.output.activation.activation_type; - - active_args.leaky_relu_negative_slope = - args.output.activation.leaky_relu_negative_slope; - - reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - active_args.leaky_relu_negative_slope; - - datalen = (uint64_t)args.image.width * (uint64_t)args.image.height * - (uint64_t)args.image.channels; - datalen = align_to_x(datalen, 16); - input_address_phy = vaddr_to_paddr_driver(args.image.address); - output_address_phy = vaddr_to_paddr_driver(args.output.address); - DLOG << "input_phy:" << input_address_phy; - DLOG << "output_phy:" << output_address_phy; - - switch (args.input_data_type) { - case DATA_TYPE_FP16: { - switch 
(args.output_data_type) { - case DATA_TYPE_FP16: - data_cell_in = SIZE_FP16; - data_cell_out = SIZE_FP16; - cmd = CMD_FP16_TO_FP16; - break; - - case DATA_TYPE_FP32: - data_cell_in = SIZE_FP16; - data_cell_out = SIZE_FP32; - cmd = CMD_FP16_TO_FP32; - break; - - default: - break; - } - } break; - - case DATA_TYPE_INT8: { - if (args.output_data_type != DATA_TYPE_FP16) { - DLOG << "error:Output Datetype error,not DATA_TYPE_FP16: " - << args.output_data_type; - } - data_cell_in = SIZE_INT8; - data_cell_out = SIZE_FP16; - cmd = CMD_INT8_TO_FP16; - } break; - - case DATA_TYPE_FP32: { - switch (args.output_data_type) { - case DATA_TYPE_FP16: - data_cell_in = SIZE_FP32; - data_cell_out = SIZE_FP16; - cmd = CMD_FP32_TO_FP16; - break; - - case DATA_TYPE_FP32: - data_cell_in = SIZE_FP32; - data_cell_out = SIZE_FP32; - cmd = CMD_FP32_TO_FP32; - break; - - default: - break; - } - } break; - - default: - break; - } - if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 && - cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32 && - cmd != CMD_INT8_TO_FP16) { - // std::cout<< " err back Error1!" 
<mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) { - ret = -EIO; - DLOG << "Bypass Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - reg_writeq(reg_ActivationArgs, - REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR); - reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR); - reg_writeq(datalen, REG_CONVERT_RD_LENGTH); - reg_writeq(datalen, REG_CONVERT_WR_LENGTH); - reg_writeq(cmd, REG_CONVERT_CMD); - DLOG << "before reg poll"; - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR; - ret = -EIO; - DLOG << "BYPASS Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "BYPASS Wait Irq Timeout!"); - } - DLOG << "after reg poll"; - - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; -#endif - return 0; -} // PerformBypass - -uint64_t FPGAVersion() { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaBypass==========="; -#endif -#ifdef PADDLE_MOBILE_ZU5 - uint64_t fpga_ver = 0; - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - fpga_ver = reg_readq(REG_HARDWARE_STATUS); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return fpga_ver; -#endif - return 0; -} // FPGAVersion - -int ComputeFPGAConcat(const struct ConcatArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaConcat==========="; - DLOG << " Image_num: " << args.image_num - << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out - << " out_channel:" << args.out_channel; - DLOG << " image_height:" << args.height << " image_width:" << 
args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" - << args.channel_num[i] - //<< " aligned_channel_num:" << args.aligned_channel_num[i] - << " image_address:" << args.images_in[i] - << " image_scale_address:" << args.scales_in[i]; - } -#endif - - image::concat_images(args.images_in, args.scales_in, args.image_out, - args.scale_out, args.image_num, args.channel_num, - args.height, args.width); - return 0; -} // ComputeFPGAConcat - -void deconv_post_process(const struct DeconvArgs &args) { - int sub_conv_n = args.sub_conv_num; - int sub_height = args.sub_output_height; - int sub_width = args.sub_output_width; - int omit_size = args.omit_size; - int channel = args.filter_num; - int num = 1; - int origin_h = sub_height * sub_conv_n; - int origin_w = sub_width * sub_conv_n; - int align_origin_w = align_to_x(origin_w * channel, 16); - int deconv_h = origin_h - 2 * omit_size; - int deconv_w = origin_w - 2 * omit_size; - int deconv_row_len = deconv_w * channel; - int align_deconv_row_len = align_to_x(deconv_row_len, 16); - - for (int idx = 0; idx < sub_conv_n; ++idx) { - paddle_mobile::fpga::fpga_invalidate( - args.split_conv_args[idx]->output.address, - align_origin_w * origin_h * sizeof(int16_t)); - } - - int deconv_idx = 0; - for (int nn = 0; nn < num; ++nn) { - for (int hh = 0; hh < origin_h; ++hh) { - int hx = (hh % sub_conv_n); - auto sub_t = - (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1] // NOLINT - ->output.address); - int hi = (hh / sub_conv_n); - if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; - int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + - omit_size * channel); - fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT - sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT - deconv_idx += align_deconv_row_len; - } - } - fpga_flush(args.output.address, - num * align_deconv_row_len * deconv_h * sizeof(int16_t)); -} -void 
DWDeconv_post_process(const struct DWDeconvArgs &args) { - int sub_conv_n = args.sub_conv_num; - int sub_height = args.sub_output_height; - int sub_width = args.sub_output_width; - int omit_size = args.omit_size; - int channel = args.filter_num; - int num = 1; - int origin_h = sub_height * sub_conv_n; - int origin_w = sub_width * sub_conv_n; - int align_origin_w = align_to_x(origin_w * channel, IMAGE_ALIGNMENT); - int deconv_h = origin_h - 2 * omit_size; - int deconv_w = origin_w - 2 * omit_size; - int deconv_row_len = deconv_w * channel; - int align_deconv_row_len = align_to_x(deconv_row_len, IMAGE_ALIGNMENT); - - for (int idx = 0; idx < sub_conv_n; ++idx) { - paddle_mobile::fpga::fpga_invalidate( - args.dw_conv_args[idx]->output.address, - align_origin_w * origin_h * sizeof(int16_t)); - } - - int deconv_idx = 0; - for (int nn = 0; nn < num; ++nn) { - for (int hh = 0; hh < origin_h; ++hh) { - int hx = (hh % sub_conv_n); - auto sub_t = (int16_t *)(args.dw_conv_args[sub_conv_n - hx - 1] // NOLINT - ->output.address); - int hi = (hh / sub_conv_n); - if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; - int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + - omit_size * channel); - fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT - sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT - deconv_idx += align_deconv_row_len; - } - } - fpga_flush(args.output.address, - num * align_deconv_row_len * deconv_h * sizeof(int16_t)); -} - -int ComputeFpgaDeconv(const struct DeconvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFPGADeConv==========="; - DLOG << " filter_num:" << args.filter_num - << " group_num:" << args.group_num << "omit_size:" << args.omit_size - << "sub_output_width: " << args.sub_output_width - << "sub_output_height: " << args.sub_output_height - << " sub_conv_num:" << args.sub_conv_num; - DLOG << "args.output.address: " << args.output.address - << "args.output.scale_address: " << 
args.output.scale_address; - -#endif - - int sub_conv_num = args.sub_conv_num; - -#ifdef COST_TIME_PRINT - timeval start, end; - long dif_sec, dif_usec; // NOLINT -#endif - - for (int i = 0; i < sub_conv_num; i++) { -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - - ComputeFpgaConv(*args.split_conv_args[i]); -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv basic_conv: " << i << " times: " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - } - - /*if (sub_conv_num > 1) { - float max_scale = -1.0f; -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - for (int i = 0; i < sub_conv_num; i++) { - paddle_mobile::fpga::fpga_invalidate( - args.split_conv_args[i]->output.scale_address, 2 * sizeof(float)); - float ptr_scale = (args.split_conv_args[i]->output.scale_address)[0]; - if (ptr_scale > max_scale) { - args.output.scale_address[0] = ptr_scale; - args.output.scale_address[1] = - (args.split_conv_args[i]->output.scale_address)[1]; - } - } - -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv scale " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - }*/ - - return 0; -} // ComputeFpgaDeconv - -int ComputeFPGASplit(const struct SplitArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFpgaSplit==========="; - DLOG << " Image_num: " << args.image_num - << " in_address:" << args.image_in - << " in_scale_address:" << args.scale_in; - DLOG << " image_height:" << args.height << " image_width:" << args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.out_channel_nums[i] - << " image_address:" << args.images_out[i] - << " image_scale_address:" << args.scales_out[i]; - } -#endif - 
image::split_image(args.image_in, args.images_out, args.image_num, - args.out_channel_nums, args.height, args.width); - return 0; -} // ComputeFPGASplit -int ComputeDWConv(const struct DWconvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeDWConv==========="; - // DLOG << " mode:" << args.relu_enabled; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " filter_address:" << args.filter_address; - //<< " bias_address:" << args.bias_address; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif -#ifdef PADDLE_MOBILE_ZU5 - DLOG << "DWConv"; - uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); - // return 0; - uint64_t timer_cnt = 0; - int ret = 0; - uint64_t cmd = 0; - uint64_t image_physical_address = 0; - uint64_t output_physical_address = 0; - uint64_t filter_physical_address = 0; - uint64_t bias_physical_address = 0; - - image_physical_address = vaddr_to_paddr(args.image.address); - output_physical_address = vaddr_to_paddr(args.output.address); - filter_physical_address = vaddr_to_paddr(args.filter_address); - bias_physical_address = vaddr_to_paddr(args.bias_address); - uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); - uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); - uint64_t output_height = (uint64_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint64_t output_width = (uint64_t)( - ((args.image.width + args.image.pad_width 
* 2 - args.kernel.width) / - args.kernel.stride_w + - 1) * - args.sub_conv_num); - - uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - (uint64_t)args.image.width * (uint64_t)args.image.channels + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32); - uint64_t result_addr_row = - (result_amount_align_32 << 32) | output_physical_address; - uint64_t row_padding_down = - (uint64_t)args.image.height + (uint64_t)args.image.pad_height; - uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; - uint64_t kernel_padding_step = row_padding_down | - ((uint64_t)args.image.pad_height << 16) | - ((uint64_t)args.kernel.stride_h << 24) | - ((uint64_t)kernel_width_sub1 << 32) | - ((uint64_t)args.kernel.height << 40) | - ((uint64_t)(args.kernel.height - 1) << 48); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - (output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t result_size_calcu_height = (output_height - 1) | - ((output_width - 1) << 16) | - (image_calcu_height << 32); - uint64_t col_padding_down = - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width) * - (uint64_t)args.image.channels; - - uint64_t image_row_col_padding_down = - image_amount_per_row | (col_padding_down << 32); - uint64_t image_rowXpadding_h = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_rowXstep_h = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t image_rowXpad_h_rowXstep_h = - image_rowXpadding_h | (image_rowXstep_h << 32); - uint64_t channelXpad_w = - (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; - uint64_t channelXstep_w = - (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; - uint64_t channelXpad_w_channelXstep_w = - channelXpad_w | (channelXstep_w << 
32); - - uint64_t filter_row_align = C_align_64 * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = - C_align_64 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height; - uint64_t filter_amount_align = - sub_filter_amount_align * (uint64_t)args.sub_conv_num; - uint64_t filter_param = filter_row_align | (filter_amount_align << 16) | - (sub_filter_amount_align << 32) | - (((uint64_t)args.sub_conv_num - 1) << 48); - uint64_t channel_parameter = - (uint64_t)args.image.channels | (C_align_64 << 16); - pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); - if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { - ret = -EIO; - DLOG << "Conv Status Error!"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; - } - reg_writeq(0ul, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, 0x808); - reg_writeq(result_addr_row, 0x810); - reg_writeq(kernel_padding_step, 0x818); - reg_writeq(result_size_calcu_height, 0x820); - reg_writeq(channel_parameter, 0x828); - reg_writeq(image_row_col_padding_down, 0x830); - reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); - reg_writeq(0, 0x840); - reg_writeq(channelXpad_w_channelXstep_w, 0x848); - reg_writeq(filter_physical_address, 0x850); - reg_writeq(filter_param, 0x858); - reg_writeq(((bias_physical_address + C_align_64 * 4) | - (bias_physical_address << 32)), - 0x860); - cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); - reg_writeq(cmd, 0x800); - - DLOG << "before reg poll"; - if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { - g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; - ret = -EIO; - DLOG << "DWconv Wait Irq Timeout!"; - PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); - } - DLOG << "after reg poll"; - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); - return ret; -#endif - return 0; -} -int ComputeDWDeconv(const struct DWDeconvArgs &args) { -#ifdef FPGA_PRINT_MODE - DLOG << "=============ComputeFPGADeConv==========="; - DLOG << " 
filter_num:" << args.filter_num - << " group_num:" << args.group_num << "omit_size:" << args.omit_size - << "sub_output_width: " << args.sub_output_width - << "sub_output_height: " << args.sub_output_height - << " sub_conv_num:" << args.sub_conv_num; - DLOG << "args.output.address: " << args.output.address - << "args.output.scale_address: " << args.output.scale_address; - -#endif - - int sub_conv_num = args.sub_conv_num; - -#ifdef COST_TIME_PRINT - timeval start, end; - long dif_sec, dif_usec; // NOLINT -#endif - - for (int i = 0; i < sub_conv_num; i++) { -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - - ComputeDWConv(*args.dw_conv_args[i]); -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv basic_conv: " << i << " times: " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - } - - if (sub_conv_num > 1) { - float max_scale = -1.0f; -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - for (int i = 0; i < sub_conv_num; i++) { - paddle_mobile::fpga::fpga_invalidate( - args.dw_conv_args[i]->output.scale_address, 2 * sizeof(float)); - float ptr_scale = (args.dw_conv_args[i]->output.scale_address)[0]; - if (ptr_scale > max_scale) { - args.output.scale_address[0] = ptr_scale; - args.output.scale_address[1] = - (args.dw_conv_args[i]->output.scale_address)[1]; - } - } - -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv scale " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - } - -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - DWDeconv_post_process(args); -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " - << " cost time: " << 
(dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif - return 0; -} // ComputeFpgaDeconv - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/common/config.h b/mobile/src/fpga/common/config.h deleted file mode 100644 index 27187c7b85..0000000000 --- a/mobile/src/fpga/common/config.h +++ /dev/null @@ -1,18 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_MOBILE_ZU5 -#define FPGA_PRINT_MODE diff --git a/mobile/src/fpga/common/driver.cpp b/mobile/src/fpga/common/driver.cpp deleted file mode 100755 index b7ce4d3247..0000000000 --- a/mobile/src/fpga/common/driver.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/enforce.h" -#include "fpga/common/driver.h" - -namespace paddle_mobile { -namespace fpga { -namespace driver { -struct FPGA_INFO g_fpgainfo; - -int open_drvdevice() { - if (g_fpgainfo.fd_drv == -1) { - g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR); - } - return g_fpgainfo.fd_drv; -} - -int open_memdevice() { - if (g_fpgainfo.fd_mem == -1) { - // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC); - g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR); - } - return g_fpgainfo.fd_mem; -} - -int close_drvdevice() { return close(g_fpgainfo.fd_drv); } - -int close_memdevice() { return close(g_fpgainfo.fd_mem); } - -void pl_reset() { usleep(100 * 1000); } - -void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe, - char const *type_name, int pe_idx) { - memset(pe, 0, sizeof(struct fpga_pe)); - - pe->outer = pe_data; - snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name); - - pe->status = IDLE; - pe->interrupt_cnt = 0; - pe_data->pes[pe_idx] = pe; - pe_data->pe_num++; -} - -void pl_init() { - struct pe_data_s *pe_data = nullptr; - - pl_reset(); - - pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s)); - if (pe_data == nullptr) { - std::cout << "pe_data malloc error!" 
<< std::endl; - return; - } - memset(pe_data, 0, sizeof(struct pe_data_s)); - pthread_mutex_init(&pe_data->mutex, 0); - - setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV); - setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING); - setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW); - setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS); - - g_fpgainfo.pe_data = pe_data; -} - -void pl_destroy() { - struct pe_data_s *pe_data = g_fpgainfo.pe_data; - pthread_mutex_destroy(&pe_data->mutex); - free(pe_data); -} - -void pl_start() { - struct pe_data_s *pe_data = g_fpgainfo.pe_data; - - pthread_mutex_unlock(&pe_data->mutex); -} - -void pl_stop() { - struct pe_data_s *pe_data = g_fpgainfo.pe_data; - - pthread_mutex_lock(&pe_data->mutex); -} - -void pl_reinit() { - struct pe_data_s *pe_data = g_fpgainfo.pe_data; - struct fpga_pe *pe = nullptr; - int i = 0; - - pl_stop(); - pl_reset(); - pl_start(); - - for (i = 0; i < pe_data->pe_num; i++) { - pe = pe_data->pes[i]; - pe->status = IDLE; - pe->interrupt_cnt = 0; - } - - pl_start(); -} - -int pl_get_status() { return 0; } - -/*tmie单位us*/ -int fpga_regpoll(uint64_t reg, uint64_t val, int time) { - uint64_t i = 0; - /*timeout精确性待确认*/ - int64_t timeout = time * 6; - - for (i = 0; i < timeout; i++) { - usleep(1); - if (val == reg_readq(reg)) { - break; - } - } - - if (i < timeout) { - return 0; - } else { - return -1; - } -} - -uint64_t vaddr_to_paddr_driver(void *address) { - uint64_t paddr = 0; - auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); - if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { - paddr = iter->second; - } else { - std::cout << "Invalid pointer: " << address << std::endl; - } - - return paddr; -} - -void *fpga_reg_malloc(size_t size) { - void *ret = nullptr; - ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, - g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR); - // PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); - - 
g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); - - return ret; -} - -void *fpga_reg_free(void *ptr) { - size_t size = 0; - - auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr); - if (iter != g_fpgainfo.fpga_addr2size_map.end()) { - size = iter->second; - g_fpgainfo.fpga_addr2size_map.erase(iter); - munmap(ptr, size); - } else { - std::cout << "Invalid pointer" << ptr << std::endl; - } -} - -static inline int do_ioctl(int64_t req, const void *arg) { - return ioctl(g_fpgainfo.fd_mem, req, arg); -} - -void *fpga_malloc_driver(size_t size) { - void *ret = nullptr; - uint64_t phy_addr = 0; - int i = 0; - struct MemoryVM2PHYArgs args; - struct MemoryCacheArgs args_c; - ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, - g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR); - PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); - - args.pVM = reinterpret_cast(ret); - args.pPHY = reinterpret_cast(0); - do_ioctl(IOCTL_MEMORY_VM2PHY, &args); - phy_addr = (uint64_t)args.pPHY; - - g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr)); - g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); - - return ret; -} - -void fpga_free_driver(void *ptr) { - size_t size = 0; - uint32_t pos = 0; - uint64_t p_addr = 0; - - auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr); - if (iter != g_fpgainfo.fpga_addr2size_map.end()) { - size = iter->second; - g_fpgainfo.fpga_addr2size_map.erase(iter); - munmap(ptr, size); - auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr); - if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { - g_fpgainfo.fpga_vaddr2paddr_map.erase(iter); - } - } else { - std::cout << "Invalid pointer" << ptr << std::endl; - } -} - -int fpga_flush_driver(void *address, size_t size) { - struct MemoryCacheArgs args; - uint64_t p_addr; - - p_addr = vaddr_to_paddr_driver(address); - - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT - args.size = size; - - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int 
fpga_invalidate_driver(void *address, size_t size) { - struct MemoryCacheArgs args; - uint64_t p_addr; - - p_addr = vaddr_to_paddr_driver(address); - - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT - args.size = size; - - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -void fpga_copy_driver(void *dest, const void *src, size_t num) { - uint64_t i; - for (i = 0; i < num; i++) { - *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT - } - - return; -} - -int open_device_driver() { - g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR; - g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR; - g_fpgainfo.FpgaRegVirAddr = nullptr; - g_fpgainfo.pe_data = nullptr; - g_fpgainfo.drvdevice_path = "/dev/fpgadrv0"; - g_fpgainfo.memdevice_path = "/dev/fpgamem0"; - g_fpgainfo.fd_drv = -1; - g_fpgainfo.fd_mem = -1; - - int ret = 0; - ret = open_drvdevice(); - ret |= open_memdevice(); - - g_fpgainfo.FpgaRegVirAddr = - (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // NOLINT - pl_init(); - return ret; -} - -int close_device_driver() { - pl_destroy(); - fpga_reg_free(g_fpgainfo.FpgaRegVirAddr); - int ret = 0; - ret = close_drvdevice(); - ret |= close_memdevice(); - return ret; -} - -} // namespace driver -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/common/driver.h b/mobile/src/fpga/common/driver.h deleted file mode 100644 index 87c68cbb5a..0000000000 --- a/mobile/src/fpga/common/driver.h +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "common/log.h" - -namespace paddle_mobile { -namespace fpga { -namespace driver { - -#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) - -#define FPGA_REG_PHY_ADDR 0x80000000 -#define FPGA_REG_SIZE 0x1000 -#define FPGA_MEM_PHY_ADDR 0x20000000 -#define FPGA_MEM_SIZE 0x20000000 - -#define FPGA_PAGE_SIZE (16UL * 1024UL) - -// PE related macros -const int MAX_NUM_PES = 6; -const size_t MAX_TYPE_NAME_LENTH = 8; - -const int PE_IDX_CONV = 0; -const int PE_IDX_POOLING = 1; -const int PE_IDX_EW = 2; -const int PE_IDX_BYPASS = 3; - -enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 }; - -struct MemoryCacheArgs { - void *offset; - size_t size; -}; - -struct MemoryVM2PHYArgs { - void *pVM; - void *pPHY; -}; - -#define IOCTL_FPGA_MAGIC 'F' -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) -#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs) - -struct fpga_pe { - char type_name[MAX_TYPE_NAME_LENTH + 1]; - struct pe_data_s *outer; - pe_status status; - uint64_t interrupt_cnt; -}; - -struct pe_data_s { - pthread_mutex_t mutex; - struct fpga_pe pe_conv; - struct fpga_pe pe_pooling; - struct fpga_pe pe_ew; - struct fpga_pe pe_bypass; - - struct fpga_pe *pes[MAX_NUM_PES]; - int pe_num; -}; - -struct fpga_memory { - pthread_mutex_t mutex; - uint64_t *bitmap; - unsigned int *nr; - unsigned int page_num; - unsigned int page_num_long; - uint64_t mem_start; - uint64_t mem_end; -}; - -struct FPGA_INFO { - uint64_t FpgaRegPhyAddr; - uint64_t FpgaMemPhyAddr; - pthread_t poll_pid; - void *FpgaRegVirAddr; - struct pe_data_s *pe_data; - - std::map fpga_addr2size_map; - std::map fpga_vaddr2paddr_map; - const char *drvdevice_path; - const char *memdevice_path; - struct 
fpga_memory *memory_info; - int fd_drv; - int fd_mem; -}; - -extern struct FPGA_INFO g_fpgainfo; - -inline uint64_t reg_readq(uint32_t offset) { - uint64_t value = - *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT - offset); // NOLINT - return value; -} - -inline void reg_writeq(uint64_t value, uint32_t offset) { - *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT - offset) = value; -} - -int open_device_driver(); - -int close_device_driver(); - -void *fpga_malloc_driver(size_t size); - -void fpga_free_driver(void *ptr); - -int fpga_flush_driver(void *address, size_t size); - -int fpga_invalidate_driver(void *address, size_t size); - -uint64_t vaddr_to_paddr_driver(void *address); - -int fpga_regpoll(uint64_t reg, uint64_t val, int time); - -} // namespace driver -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/common/fpga_common.cpp b/mobile/src/fpga/common/fpga_common.cpp deleted file mode 100644 index 2c589b3ef6..0000000000 --- a/mobile/src/fpga/common/fpga_common.cpp +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "fpga/common/fpga_common.h" -#include -#include -#include -#include "fpga/common/config.h" -#include "fpga/common/driver.h" - -namespace paddle_mobile { -namespace fpga { - -int16_t fp32_2_fp16(float fp32_num) { - int32_t tmp = *(reinterpret_cast(&fp32_num)); - int16_t se_fp32 = (tmp >> 23) & 0x1ff; - int32_t m_fp32 = tmp & 0x007fffff; - int16_t se_fp16 = 0; - int16_t m_fp16 = 0; - - if (se_fp32 < 103) { - se_fp16 = 0x0000; - m_fp16 = m_fp32 >> 24; - } else if (se_fp32 < 113) { - se_fp16 = (0x0400 >> (113 - se_fp32)); - m_fp16 = m_fp32 >> (126 - se_fp32); - } else if (se_fp32 <= 142) { - se_fp16 = (se_fp32 - 112) << 10; - m_fp16 = m_fp32 >> 13; - } else if (se_fp32 < 255) { - se_fp16 = 0x7C00; - m_fp16 = m_fp32 >> 24; - } else if (se_fp32 == 255) { - se_fp16 = 0x7C00; - m_fp16 = m_fp32 >> 13; - } else if (se_fp32 < 359) { - se_fp16 = 0x8000; - m_fp16 = m_fp32 >> 24; - } else if (se_fp32 < 369) { - se_fp16 = (0x0400 >> (369 - se_fp32)) | 0x8000; - m_fp16 = m_fp32 >> (382 - se_fp32); - } else if (se_fp32 <= 398) { - se_fp16 = ((se_fp32 - 368) << 10) | 0x8000; - m_fp16 = m_fp32 >> 13; - } else if (se_fp32 < 511) { - se_fp16 = 0x7C00; - m_fp16 = m_fp32 >> 24; - } else { - se_fp16 = 0x7C00; - m_fp16 = m_fp32 >> 13; - } - int16_t result = se_fp16 + m_fp16; - return result; -} - -int32_t convertmantissa(int32_t i) { - int32_t m = i << 13; - int32_t e = 0; - while (!(m & 0x00800000)) { - e -= 0x00800000; - m <<= 1; - } - m &= ~0x00800000; - e += 0x38800000; - return m | e; -} - -float fp16_2_fp32(int16_t fp16_num) { - int16_t se_fp16 = (fp16_num >> 10) & 0x3f; - int16_t m_fp16 = fp16_num & 0x3ff; - int32_t e_fp32 = 0; - int16_t offset = 0; - int32_t m_fp32 = 0; - if (se_fp16 == 0) { - e_fp32 = 0; - offset = 0; - } else if (se_fp16 < 31) { - e_fp32 = se_fp16 << 23; - offset = 1024; - } else if (se_fp16 == 31) { - e_fp32 = 0x47800000; - offset = 1024; - } else if (se_fp16 == 32) { - e_fp32 = 0x80000000; - offset = 0; - } else if (se_fp16 < 63) { - e_fp32 = 
0x80000000 + ((se_fp16 - 32) << 23); - offset = 1024; - } else { // se_fp16 == 63 - e_fp32 = 0xC7800000; - offset = 1024; - } - int16_t a = offset + m_fp16; - if (a == 0) { - m_fp32 = 0; - } else if (a < 1024) { - int32_t tmp = a; - m_fp32 = convertmantissa(tmp); - } else { - int32_t tmp = a - 1024; - m_fp32 = 0x38000000 + (tmp << 13); - } - - int32_t tmp = e_fp32 + m_fp32; - float fp32_num = *(reinterpret_cast(&tmp)); - return fp32_num; -} - -static std::map memory_map; - -int open_device() { - int ret = driver::open_device_driver(); - return ret; -} - -int close_device() { - int ret = driver::close_device_driver(); - return ret; -} - -void *fpga_malloc(size_t size) { - static uint64_t counter = 0; - if (size <= 0) { - size = 1; - } -#ifdef PADDLE_MOBILE_ZU5 - auto ptr = driver::fpga_malloc_driver(size); -#else - auto ptr = malloc(size); -#endif - counter += size; - memory_map.insert(std::make_pair(ptr, size)); - // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " - // << counter << " bytes"; - return ptr; -} - -void fpga_free(void *ptr) { - if (ptr == nullptr) { - return; - } - static uint64_t counter = 0; - size_t size = 0; - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); -#ifdef PADDLE_MOBILE_ZU5 - driver::fpga_free_driver(ptr); -#else - free(ptr); -#endif - counter += size; - // DLOG << "Address: " << ptr << ", " << size << " bytes freed. 
Total " - // << counter << " bytes"; - } else { - DLOG << "Address: " << ptr << " Invalid pointer"; - } -} -void fpga_copy(void *dest, const void *src, size_t num) { -#ifdef PADDLE_MOBILE_ZU5 - // driver::fpga_copy_driver(dest, src, num); - memcpy(dest, src, num); -#else - memcpy(dest, src, num); -#endif -} - -int fpga_flush(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_flush_driver(address, size); -#else - return 0; -#endif -} -int fpga_invalidate(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_invalidate_driver(address, size); -#else - return 0; -#endif -} -uint64_t vaddr_to_paddr(void *address) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::vaddr_to_paddr_driver(address); -#else - return 0; -#endif -} - -uint32_t paddle_mobile_version() { - uint32_t v_master = 52; - uint32_t v_slave = 52; - - uint32_t first = 1, second = 2, fourth_master = 1, fourth_slave = 1; - uint32_t master = first << 24 | second << 16 | v_master << 8 | fourth_master; - uint32_t slave = first << 24 | second << 16 | v_slave << 8 | fourth_slave; - - return slave; -} - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/common/fpga_common.h b/mobile/src/fpga/common/fpga_common.h deleted file mode 100755 index a767cd2606..0000000000 --- a/mobile/src/fpga/common/fpga_common.h +++ /dev/null @@ -1,331 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#ifdef PADDLE_MOBILE_FPGA_V1 -#define IMAGE_ALIGNMENT (16) // Aligned to 16 -#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16 -#define BS_NUM_ALIGNMENT (8) -#define BIAS_NUM_ALIGNMENT (16) -#define ROW_PARALLEL_NUM (2) -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#define IMAGE_ALIGNMENT (32) // Aligned to 32 -#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16 -#define BS_NUM_ALIGNMENT (8) -#define BIAS_SCALE_DMA_NUM (4) -#define RESULT_ALIGNMENT (32) - -#define PE_COLUMN (8) -#define ROW_PARALLEL_NUM (2) - -#define BIAS_NUM_ALIGNMENT (16) - -#endif - -namespace paddle_mobile { -namespace fpga { - -enum DataType { - DATA_TYPE_INT8 = 2, - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -enum ActivationType { - NONE = 0, - LEAKYRELU = 1, - SIGMOID = 2, - TANH = 3, - SOFTMAX = 4, -}; - -struct ActivationArgs { - enum ActivationType activation_type = NONE; - int16_t leaky_relu_negative_slope; -}; - -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - float* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; - uint64_t timer_cnt; // time counter for FPGA computation - struct ActivationArgs - activation; // To select activation and specify (Leaky)Relu parameter. 
-}; - -// #ifdef PADDLE_MOBILE_FPGA_V1 -struct ConvDriverParam { - uint64_t filter_per_group; - uint64_t channel_per_group; - uint64_t image_one_pad_per_row; - uint64_t deconv_param; - - // new - uint64_t col_padding_up; - uint64_t col_padding_down; - uint64_t row_padding_up; - uint64_t row_padding_down; - - uint64_t image_block_amount_per_row; - uint64_t filter_pad_width_mul_channel; - uint64_t image_win_cnt; - uint64_t image_win_cnt_last; - - uint64_t filter_row; - uint64_t filter_width; - uint64_t filter_height; - uint64_t skip_window; - uint64_t stride_h; - - uint64_t filter_amount_all; - uint64_t prog_full_cnt; - uint64_t filter_align; - uint64_t filter_num; - - uint64_t output_width; - uint64_t output_amount_per_row; - uint64_t res_row_data_align4_pad; - uint64_t cal_res_num; - uint64_t last_cal_res_row_num; - uint64_t post_prog_full_cnt; - - uint64_t deconv_skip_row; // paralvl*deconv_group - uint64_t deconv_res_skip_row; // deconv_group * result_amount_per_row - uint64_t deconv_ena; - uint64_t deconv_dump; - - uint64_t output_address_phy; - uint64_t output_height; - uint64_t result_amount_per_row_multi_para; - - uint64_t sb_address_phy; - uint64_t fpga_bias_scale_len; - uint64_t filter_amount_whole; - - uint64_t filter_address_phy; - uint64_t filters_amount_whole; - - uint64_t image_address_phy; - uint64_t image_hight; - uint64_t image_amount_per_row; - - uint64_t image_amount_per_row_multi_win_first; - uint64_t image_amount_per_row_multi_win; - uint64_t filter_pad_hight; - - uint64_t image_block_num; - uint64_t image_block_len; - uint64_t image_block_len_last; - - uint64_t cmd; -}; - -struct EWAddDriverParam { - uint64_t image0_address_phy; - uint64_t image1_address_phy; - uint64_t datalen; - uint64_t image_image_pixel; - uint64_t image_amount_per_row; - uint64_t output_address_phy; - uint64_t coefficient; - uint64_t cmd; -}; - -struct DeconvTxParm { - uint32_t omit_size; - uint32_t sub_conv_num; - uint32_t deconv_en; - uint32_t out_addr_offset; -}; - 
-struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias - void* filter_address; - float* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; - - // #ifdef PADDLE_MOBILE_FPGA_V1 - struct DeconvTxParm deconv_tx_param; - struct ConvDriverParam driver; -}; - -struct ConcatArgs { - uint32_t image_num; -#ifdef PADDLE_MOBILE_FPGA_V2 - int8_t** images_in; -#else - int16_t** images_in; -#endif - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t* aligned_channel_num; // Not used so far. Reserved for V2. - uint32_t out_channel; - uint32_t height; - uint32_t width; - std::vector> vector_concat_space; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_arg; - struct ConcatArgs concat_arg; - std::shared_ptr shared_conv_arg; - std::vector> vector_concat_space; - std::vector> vector_conv_space; -}; - -struct SplitArgs { - uint32_t image_num; -#ifdef PADDLE_MOBILE_FPGA_V2 - int8_t* image_in; -#else - int16_t* image_in; -#endif - float* scale_in; - void** images_out; - float** scales_out; - uint32_t* out_channel_nums; - uint32_t height; - uint32_t width; - std::vector> vector_split_space; -}; - -struct PoolingArgs { - int16_t mode; // mode: 0:max, 1:avg - int16_t kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct EWAddArgs { - bool relu_enabled; - uint32_t const0; // output0 = const0 x input0 + const1 x input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; - // #ifdef PADDLE_MOBILE_FPGA_V1 - struct EWAddDriverParam driver; -}; - -struct BypassArgs { - enum DataType input_data_type; - enum DataType output_data_type; - enum LayoutType 
input_layout_type; - enum LayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct DeconvArgs { - uint32_t sub_conv_num; - uint32_t group_num; - uint32_t filter_num; - uint32_t omit_size; - uint32_t sub_output_width; - uint32_t sub_output_height; - struct ImageOutputArgs output; - std::vector> split_conv_args; -}; -struct DWconvArgs { - uint32_t sub_conv_num; - bool relu_enabled; - void* bias_address; - void* filter_address; - struct KernelArgs kernel; - struct ImageInputArgs image; - struct ImageOutputArgs output; - std::vector> vector_dwconv_space; -}; - -struct DWDeconvArgs { - uint32_t sub_conv_num; - uint32_t group_num; - uint32_t filter_num; - uint32_t omit_size; - uint32_t sub_output_width; - uint32_t sub_output_height; - struct ImageOutputArgs output; - std::vector> dw_conv_args; - std::vector> vector_dw_conv_space; -}; - -// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; -// } -static inline uint32_t align_to_x(int64_t num, int64_t x) { - return ((uint32_t)(num + x) - 1) / (uint32_t)x * (uint32_t)x; -} - -int16_t fp32_2_fp16(float fp32_num); -float fp16_2_fp32(int16_t fp16_num); - -int open_device(); -int close_device(); -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dest, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -uint64_t vaddr_to_paddr(void* address); -void expand_conv_arg(ConvArgs* arg); -void expand_EW_arg(EWAddArgs* arg); -inline int32_t convertmantissa(int32_t i); - -uint32_t paddle_mobile_version(); - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/fpga/common/pe.h b/mobile/src/fpga/common/pe.h deleted file mode 100644 index cf0574bc04..0000000000 --- a/mobile/src/fpga/common/pe.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "fpga/common/fpga_common.h" - -namespace paddle_mobile { -namespace fpga { - -uint64_t FPGAVersion(); -int PerformBypass(const struct BypassArgs& args); -int ComputeBasicConv(const struct ConvArgs& args); -int ComputeFpgaPool(const struct PoolingArgs& args); -int ComputeFpgaEWAdd(const struct EWAddArgs& args); - -int ComputeFpgaConv(const struct SplitConvArgs& args); -int ComputeFPGAConcat(const struct ConcatArgs& args); -int ComputeFPGASplit(const struct SplitArgs& args); -int ComputeFpgaDeconv(const struct DeconvArgs& args); -int ComputeDWConv(const struct DWconvArgs& args); -int ComputeDWDeconv(const struct DWDeconvArgs& args); - -} // namespace fpga -} // namespace paddle_mobile diff --git a/mobile/src/framework/CMakeLists.txt b/mobile/src/framework/CMakeLists.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mobile/src/framework/attribute.cpp b/mobile/src/framework/attribute.cpp deleted file mode 100644 index 8b150f4e9e..0000000000 --- a/mobile/src/framework/attribute.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "attribute.h" - -namespace paddle_mobile { -namespace framework { - -struct PrintVistor : Vistor { - explicit PrintVistor(Print &printer) : printer_(printer) {} - template - Print &operator()(const T &value) { - printer_ << value; - return printer_; - } - - private: - Print &printer_; -}; - -Print &operator<<(Print &printer, const Attribute &attr) { - Attribute::ApplyVistor(PrintVistor(printer), attr); - // std::vector v = {"1", "2"}; - // printer << (v); - return printer; -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/attribute.h b/mobile/src/framework/attribute.h deleted file mode 100644 index ece55f99b6..0000000000 --- a/mobile/src/framework/attribute.h +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "common/enforce.h" -#include "common/log.h" -#include "common/variant.h" -#include "framework/framework.pb-c.h" - -namespace paddle_mobile { -namespace framework { -using std::string; -using std::vector; - -class BlockDesc; - -class Attribute { - public: - static Attribute GetAttrValue( - PaddleMobile__Framework__Proto__OpDesc__Attr *attr_desc) { - Attribute attr; - switch (attr_desc->type) { - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN: { - attr.Set(attr_desc->b); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT: { - attr.Set(attr_desc->i); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT: { - attr.Set(attr_desc->f); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING: { - attr.Set(attr_desc->s); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: { - vector val(attr_desc->n_bools); - for (int i = 0; i < attr_desc->n_bools; ++i) { - val[i] = attr_desc->bools[i]; - } - attr.Set>(val); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS: { - vector val(attr_desc->n_ints); - for (int i = 0; i < attr_desc->n_ints; ++i) { - val[i] = attr_desc->ints[i]; - } - attr.Set>(val); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS: { - vector val(attr_desc->n_floats); - for (int i = 0; i < attr_desc->n_floats; ++i) { - val[i] = attr_desc->floats[i]; - } - attr.Set>(val); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS: { - vector val(attr_desc->n_strings); - for (int i = 0; i < attr_desc->n_strings; ++i) { - val[i] = attr_desc->strings[i]; - } - attr.Set>(val); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG: { - attr.Set(attr_desc->l); - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK: { - break; - } - case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS: { - vector val(attr_desc->n_longs); - for 
(int i = 0; i < attr_desc->n_longs; ++i) { - val[i] = attr_desc->longs[i]; - } - attr.Set>(val); - break; - } - default: - PADDLE_MOBILE_THROW_EXCEPTION("attr type not support"); - } - return attr; - } - - Attribute() {} - template - Attribute &Set(Args &&... args) { - variant_.Set(args...); - return *this; - } - - template - T &Get() const { - return variant_.Get(); - } - - std::string GetString() const { return variant_.Get(); } - - template - static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) { - if (attr.variant_.TypeId() == type_id()) { // NOLINT - return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == type_id()) { // NOLINT - return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == type_id()) { - return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == type_id>()) { - return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == type_id>()) { - return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == type_id>()) { - return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == type_id()) { // NOLINT - return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == type_id>()) { - return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == type_id()) { - return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == type_id()) { - return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == - type_id>()) { - return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == type_id>()) { - return vistor(attr.variant_.Get>()); - } else { - PADDLE_MOBILE_THROW_EXCEPTION("type not support"); - } - } - - private: - Variant, vector, vector, bool, - vector, BlockDesc *, vector, int64_t, - vector> - variant_; -}; - -using AttributeMap = std::unordered_map; - -class AttrReader { - public: - explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {} - - template - inline T 
Get(const string &name) const { - PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0, - "%s should be in AttributeMap", name.c_str()); - return ((Attribute)attrs_.at(name)).Get(); - } - - private: - const AttributeMap &attrs_; -}; - -Print &operator<<(Print &printer, const Attribute &op_desc); - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_deleter.h b/mobile/src/framework/cl/cl_deleter.h deleted file mode 100644 index 731e5de663..0000000000 --- a/mobile/src/framework/cl/cl_deleter.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "CL/cl.h" -#include "common/log.h" -struct CLKernelDeleter { - template - void operator()(T *clKernelObj) { - const cl_int status = clReleaseKernel(clKernelObj); - LOG(paddle_mobile::kNO_LOG) << "clReleaseKernel status: " << status; - } -}; - -struct CLMemDeleter { - template - void operator()(T *clMemObj) { - const cl_int status = clReleaseMemObject(clMemObj); - LOG(paddle_mobile::kNO_LOG) << "CLMemDeleter status: " << status; - } -}; - -struct CLEventDeleter { - template - void operator()(T *clEventObj) { - const cl_int status = clReleaseEvent(clEventObj); - LOG(paddle_mobile::kNO_LOG) << "CLEventDeleter status: " << status; - } -}; - -struct CLCommQueueDeleter { - template - void operator()(T *clQueueObj) { - const cl_int status = clReleaseCommandQueue(clQueueObj); - LOG(paddle_mobile::kNO_LOG) << "CLCommQueueDeleter status: " << status; - } -}; - -struct CLContextDeleter { - template - void operator()(T *clContextObj) { - const cl_int status = clReleaseContext(clContextObj); - LOG(paddle_mobile::kNO_LOG) << "CLContextDeleter status: " << status; - } -}; - -struct CLProgramDeleter { - template - void operator()(T *clProgramObj) { - const cl_int status = clReleaseProgram(clProgramObj); - LOG(paddle_mobile::kNO_LOG) << "CLProgramDeleter status: " << status; - } -}; diff --git a/mobile/src/framework/cl/cl_engine.cpp b/mobile/src/framework/cl/cl_engine.cpp deleted file mode 100644 index e8a8361eac..0000000000 --- a/mobile/src/framework/cl/cl_engine.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/cl/cl_engine.h" -#include "CL/cl.h" -#include "framework/cl/cl_tool.h" - -#include -#include - -namespace paddle_mobile { -namespace framework { - -bool CLEngine::Init() { - LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init()"; - if (initialized_) { - return true; - } - LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init() ..."; - cl_int status; - bool is_setplatform_success = SetPlatform(); - bool is_setcldeviceid_success = SetClDeviceId(); - is_init_success_ = is_setplatform_success && is_setcldeviceid_success; - initialized_ = true; - return initialized_; - // setClCommandQueue(); - // std::string filename = "./HelloWorld_Kernel.cl"; - // loadKernelFromFile(filename.c_str()); - // buildProgram(); -} - -CLEngine *CLEngine::Instance() { - static CLEngine cl_engine_; - cl_engine_.Init(); - return &cl_engine_; -} - -bool CLEngine::isInitSuccess() { return is_init_success_; } -bool CLEngine::SetPlatform() { - platform_ = NULL; // the chosen platform - cl_uint numPlatforms; // the NO. of platforms - cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); - if (status != CL_SUCCESS) { - return false; - } - /**For clarity, choose the first available platform. 
*/ - LOG(paddle_mobile::kNO_LOG) << "numPlatforms: " << numPlatforms; - if (numPlatforms > 0) { - cl_platform_id *platforms = reinterpret_cast( - malloc(numPlatforms * sizeof(cl_platform_id))); - status = clGetPlatformIDs(numPlatforms, platforms, NULL); - platform_ = platforms[0]; - free(platforms); - LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; - return status == CL_SUCCESS; - } - - return false; -} - -bool CLEngine::SetClDeviceId() { - cl_uint numDevices = 0; - LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; - cl_int status = - clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); - if (status != CL_SUCCESS) { - return false; - } - LOG(paddle_mobile::kNO_LOG) << "numDevices: " << numDevices; - - if (numDevices > 0) { - status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_, - NULL); - LOG(paddle_mobile::kNO_LOG) << "devices_[0]" << devices_[0]; - return status == CL_SUCCESS; - } - return false; -} -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_engine.h b/mobile/src/framework/cl/cl_engine.h deleted file mode 100644 index 2a6362ebc0..0000000000 --- a/mobile/src/framework/cl/cl_engine.h +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "CL/cl.h" -#include "common/enforce.h" -#include "common/log.h" -#include "framework/cl/cl_deleter.h" -#include "framework/cl/cl_tool.h" - -namespace paddle_mobile { -namespace framework { - -class CLLocalWorkSizeInfo { - public: - CLLocalWorkSizeInfo() { - max_work_group_size = 0; - max_work_item_size0 = 0; - max_work_item_size1 = 0; - max_work_item_size2 = 0; - } - CLLocalWorkSizeInfo(size_t total_size, size_t size0, size_t size1, - size_t size2) { - max_work_group_size = total_size; - max_work_item_size0 = size0; - max_work_item_size1 = size1; - max_work_item_size2 = size2; - } - bool isEmpty() { - return max_work_group_size == 0 && max_work_item_size0 == 0 && - max_work_item_size1 == 0 && max_work_item_size2 == 0; - } - - // max total number of work-items in the work-group - size_t max_work_group_size; - // max number of work-items in local_work_size in dim 0 - size_t max_work_item_size0; - // max number of work-items in local_work_size in dim 1 - size_t max_work_item_size1; - // max number of work-items in local_work_size in dim 2 - size_t max_work_item_size2; -}; -inline void ctx_info(const char *errinfo, const void *private_info, size_t cb, - void *user_data) { - fprintf(stderr, "OpenCL Error (via pfn_notify): %s\n", errinfo); -} -class CLEngine { - public: - static CLEngine *Instance(); - - bool Init(); - bool isInitSuccess(); - - std::shared_ptr<_cl_context> CreateContext() { - DLOG << "CreateContext ---"; - DLOG << "platform: " << platform_; - DLOG << "devices_[0]: " << devices_[0]; - - cl_int status; - cl_context c = clCreateContext(NULL, 1, devices_, &ctx_info, NULL, &status); - std::shared_ptr<_cl_context> context(c, CLContextDeleter()); - CL_CHECK_ERRORS(status); - return std::move(context); - } - - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( - cl_context context) { - cl_int status; - cl_command_queue queue = - clCreateCommandQueue(context, 
devices_[0], 0, &status); - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr( - queue); - CL_CHECK_ERRORS(status); - return std::move(command_queue_ptr); - } - - cl_context getContext() { - if (context_.get() == nullptr) { - context_ = CreateContext(); - } - return context_.get(); - } - - cl_command_queue getClCommandQueue() { - if (command_queue_.get() == nullptr) { - command_queue_ = CreateClCommandQueue(getContext()); - } - return command_queue_.get(); - } - - CLLocalWorkSizeInfo getLocalWorkSizeInfo() { - if (!localWorkSizeInfo_.isEmpty()) { - return localWorkSizeInfo_; - } - cl_int status; - size_t max_work_group_size = 0; - status = clGetDeviceInfo(devices_[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, - sizeof(size_t), &max_work_group_size, NULL); - if (status != CL_SUCCESS) { - return CLLocalWorkSizeInfo(0, 0, 0, 0); - } - cl_uint max_dims_num = 0; - status = clGetDeviceInfo(devices_[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, - sizeof(cl_uint), &max_dims_num, NULL); - if (status != CL_SUCCESS) { - return CLLocalWorkSizeInfo(0, 0, 0, 0); - } - DLOG << "max_work_item_sizes max_dims_num: " << max_dims_num; - size_t *max_work_item_sizes = - reinterpret_cast(calloc(max_dims_num, sizeof(size_t))); - size_t ret_size = 0; - status = clGetDeviceInfo(devices_[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, - max_dims_num * sizeof(size_t), max_work_item_sizes, - &ret_size); - if (status != CL_SUCCESS || ret_size / sizeof(size_t) < 3) { - return CLLocalWorkSizeInfo(0, 0, 0, 0); - } - DLOG << " max_work_item_sizes {" << max_work_item_sizes[0] << ", " - << max_work_item_sizes[1] << ", " << max_work_item_sizes[2] << "}"; - - localWorkSizeInfo_ = - CLLocalWorkSizeInfo(max_work_group_size, max_work_item_sizes[0], - max_work_item_sizes[1], max_work_item_sizes[2]); - free(max_work_item_sizes); - return localWorkSizeInfo_; - } - size_t GetKernelWorkSize(cl_kernel kernel) { - cl_int status; - size_t kernel_work_size = 0; - status = - clGetKernelWorkGroupInfo(kernel, devices_[0], 
CL_KERNEL_WORK_GROUP_SIZE, - sizeof(size_t), &kernel_work_size, NULL); - if (status != CL_SUCCESS) { - return 0; - } - DLOG << "kernel_work_size: " << kernel_work_size; - return kernel_work_size; - } - - std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith( - cl_context context, std::string file_name) { - FILE *file = fopen(file_name.c_str(), "rb"); - PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", - file_name.c_str()); - fseek(file, 0, SEEK_END); - int64_t size = ftell(file); - PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); - rewind(file); - char *data = new char[size + 1]; - size_t bytes_read = fread(data, 1, size, file); - data[size] = '\0'; - PADDLE_MOBILE_ENFORCE(bytes_read == size, - "read binary file bytes do not match with fseek"); - fclose(file); - - const char *source = data; - size_t sourceSize[] = {strlen(source)}; - cl_program p = - clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); - - DLOG << " cl kernel file name: " << file_name; - DLOG << " source size: " << sourceSize[0]; - CL_CHECK_ERRORS(status_); - - std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); - - return std::move(program_ptr); - } - - std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWithSource( - cl_context context, const char *source) { - size_t sourceSize[] = {strlen(source)}; - cl_program p = - clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); - - LOG(kLOG_DEBUG4) << " cl kernel from source"; - LOG(kLOG_DEBUG4) << " source size: " << sourceSize[0]; - CL_CHECK_ERRORS(status_); - - std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); - - return std::move(program_ptr); - } - - std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) { - cl_event event = clCreateUserEvent(context, &status_); - std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event); - CL_CHECK_ERRORS(status_); - return std::move(event_ptr); - } - - bool BuildProgram(cl_program program, const 
std::string &options = "") { - cl_int status; - std::string path = options + " -cl-fast-relaxed-math"; - - status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0); - - CL_CHECK_ERRORS(status); - - if (status == CL_BUILD_PROGRAM_FAILURE) { - size_t log_size; - clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char *log = reinterpret_cast(malloc(log_size)); - clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), - CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - DLOG << " program build error: " << log; - } - - return status == CL_SUCCESS; - } - - cl_device_id DeviceID(int index = 0) { return devices_[index]; } - - std::string GetCLPath() { return cl_path_; } - void setClPath(std::string cl_path) { cl_path_ = cl_path; } - - private: - CLEngine() { initialized_ = false; } - - bool SetPlatform(); - - bool SetClDeviceId(); - - bool initialized_; - - CLLocalWorkSizeInfo localWorkSizeInfo_; - - cl_int status_; - std::string cl_path_; - bool is_init_success_ = false; - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_; - std::shared_ptr<_cl_context> context_; - cl_device_id devices_[10]; - cl_platform_id platform_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_half.cpp b/mobile/src/framework/cl/cl_half.cpp deleted file mode 100644 index 2877289325..0000000000 --- a/mobile/src/framework/cl/cl_half.cpp +++ /dev/null @@ -1,518 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf - -#include "framework/cl/cl_half.h" - -namespace paddle_mobile { -namespace framework { - -static const uint32_t mantissatable[2048] = { - 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, - 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, - 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, - 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, - 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, - 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, - 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, - 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, - 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, - 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, - 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, - 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, - 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, - 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, - 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, - 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, - 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, - 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, - 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, - 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, - 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, - 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, - 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 
0x37090000, - 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, - 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, - 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, - 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, - 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, - 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, - 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, - 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, - 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, - 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, - 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, - 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, - 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, - 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, - 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, - 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, - 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, - 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, - 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, - 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, - 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, - 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, - 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, - 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, - 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, - 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 
0x37958000, - 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, - 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, - 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, - 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, - 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, - 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, - 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, - 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, - 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, - 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, - 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, - 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, - 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, - 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, - 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, - 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, - 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, - 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, - 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, - 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, - 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, - 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, - 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, - 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, - 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, - 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, - 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 
0x37e68000, - 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, - 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, - 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, - 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, - 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, - 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, - 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, - 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, - 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, - 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, - 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, - 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, - 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, - 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, - 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, - 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, - 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, - 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, - 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, - 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, - 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, - 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, - 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, - 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, - 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, - 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, - 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 
0x381bc000, - 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, - 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, - 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, - 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, - 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, - 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, - 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, - 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, - 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, - 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, - 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, - 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, - 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, - 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, - 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, - 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, - 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, - 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, - 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, - 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, - 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, - 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, - 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, - 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, - 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, - 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, - 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 
0x38444000, - 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, - 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, - 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, - 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, - 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, - 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, - 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, - 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, - 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, - 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, - 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, - 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, - 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, - 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, - 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, - 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, - 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, - 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, - 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, - 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, - 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, - 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, - 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, - 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, - 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, - 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, - 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 
0x386cc000, - 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, - 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, - 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, - 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, - 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, - 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, - 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, - 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, - 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, - 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, - 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, - 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, - 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, - 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, - 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, - 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, - 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, - 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, - 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, - 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, - 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, - 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, - 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, - 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, - 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, - 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, - 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 
0x380aa000, - 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, - 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, - 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, - 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, - 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, - 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, - 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, - 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, - 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, - 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, - 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, - 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, - 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, - 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, - 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, - 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, - 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, - 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, - 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, - 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, - 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, - 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, - 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, - 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, - 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, - 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, - 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 
0x381ee000, - 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, - 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, - 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, - 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, - 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, - 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, - 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, - 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, - 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, - 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, - 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, - 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, - 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, - 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, - 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, - 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, - 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, - 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, - 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, - 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, - 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, - 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, - 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, - 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, - 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, - 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, - 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 
0x38332000, - 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, - 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, - 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, - 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, - 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, - 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, - 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, - 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, - 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, - 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, - 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, - 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, - 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, - 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, - 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, - 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, - 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, - 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, - 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, - 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, - 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, - 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, - 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, - 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, - 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, - 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 
0x38476000, - 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, - 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, - 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, - 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, - 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, - 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, - 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, - 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, - 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, - 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, - 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, - 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, - 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, - 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, - 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, - 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, - 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, - 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, - 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, - 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, - 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, - 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, - 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, - 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, - 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, - 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 
0x385ba000, - 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, - 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, - 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, - 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, - 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, - 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, - 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, - 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, - 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, - 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, - 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, - 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, - 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, - 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, - 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, - 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, - 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, - 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, - 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, - 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, - 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, - 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, - 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, - 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, - 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, - 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 
0x386fe000, - 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, - 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, - 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, - 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, - 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, - 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, - 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, - 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, - 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, - 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, - 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, - 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, - 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, - 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, - 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, - 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, - 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, - 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, - 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, - 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, - 0x387fc000, 0x387fe000}; - -static const uint16_t offsettable[64] = { - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; - -static const uint32_t exponenttable[64] = { - 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, - 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, - 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, - 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, - 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, - 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, - 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, - 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, - 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, - 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, - 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; - -static const uint16_t basetable[512] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, - 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 
0x0800, 0x0c00, 0x1000, - 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, - 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, - 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
0x8000, 0x8000, 0x8001, - 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, - 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, - 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, - 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, - 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; - -static const uint8_t shifttable[512] = { - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, - 0x12, 0x11, 
0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, - 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; - -half_t Float2Half(float f) { - uint32_t v = *reinterpret_cast(&f); - return basetable[(v >> 23) & 0x1ff] + - ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); -} - -float Half2Float(half_t h) { - uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + - exponenttable[h >> 10]; - return *reinterpret_cast(&v); -} - -void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { - for (int i = 0; i < count; ++i) { - h_array[i] = Float2Half(f_array[i]); - } -} - -void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { - for (int i = 0; i < count; ++i) { - f_array[i] = Half2Float(h_array[i]); - } -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_half.h b/mobile/src/framework/cl/cl_half.h deleted file mode 100644 index 9b05740f1e..0000000000 --- a/mobile/src/framework/cl/cl_half.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include - -namespace paddle_mobile { -namespace framework { - -typedef uint16_t half_t; - -half_t Float2Half(float f); - -float Half2Float(half_t h); - -void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); - -void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_helper.h b/mobile/src/framework/cl/cl_helper.h deleted file mode 100644 index db9aa37ae2..0000000000 --- a/mobile/src/framework/cl/cl_helper.h +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "common/log.h" -#include "framework/cl/cl_deleter.h" -#include "framework/cl/cl_image.h" -#include "framework/cl/cl_scope.h" - -namespace paddle_mobile { -namespace framework { - -class CLHelper { - public: - CLHelper() = default; - - explicit CLHelper(CLScope *scope) : scope_(scope) {} - - void AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options = "") { - LOG(kLOG_DEBUG1) << " begin add kernel "; - auto kernel = scope_->GetKernel(kernel_name, file_name, options); - LOG(kLOG_DEBUG1) << " begin add kernel "; - kernels.emplace_back(std::move(kernel)); - } - - cl_kernel KernelAt(const int index) { - DLOG << " kernel count: " << kernels.size(); - return kernels[index].get(); - } - - cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); } - - cl_context CLContext() { return scope_->Context(); } - - CLLocalWorkSizeInfo LocalWorkSizeInfo() { - return scope_->LocalWorkSizeInfo(); - } - size_t KernelWorkSize(cl_kernel kernel) { - return scope_->KernelWorkSize(kernel); - } - - std::vector DefaultWorkSize(const CLImage &image) { - // n c h w - auto image_dim = image.dims(); - if (image_dim.size() == 4) { - auto n = image_dim[0]; - auto h = image_dim[2]; - auto w = image_dim[3]; - auto image_width = image.ImageWidth(); - size_t work_size_0 = image_width / w; - size_t work_size_1 = w; - size_t work_size_2 = n * h; - return {work_size_0, work_size_1, work_size_2}; - } else if (image_dim.size() == 2) { - auto h = image_dim[0]; - auto w = image_dim[1]; - return {1, image.ImageWidth(), image.ImageHeight()}; - } else if (image_dim.size() == 1) { - return {1, image.ImageWidth(), 1}; - } else if (image_dim.size() == 3) { - size_t c = image_dim[0]; - size_t h = image_dim[1]; - size_t w = image_dim[2]; - return {(c + 3) / 4, w, h}; - } - PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); - } - - private: - CLScope *scope_; - 
std::vector> kernels; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_image.cpp b/mobile/src/framework/cl/cl_image.cpp deleted file mode 100644 index 1b8966742d..0000000000 --- a/mobile/src/framework/cl/cl_image.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/cl/cl_image.h" -#include "framework/cl/cl_tensor.h" - -namespace paddle_mobile { -namespace framework { - -void CLImage::PrintTensor(const CLImage &cl_image) const { - size_t width = cl_image.ImageDims()[0]; - size_t height = cl_image.ImageDims()[1]; - - half_t *image_data = new half_t[height * width * 4]; - cl_int err; - cl_mem image = cl_image.GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, - region, 0, 0, image_data, 0, NULL, NULL); - - CL_CHECK_ERRORS(err); - - PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0, - "cl_image numel should not be 0 "); - float *tensor_data = new float[cl_image.numel()]; - auto converter = cl_image.Converter(); - converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(), - cl_image.dims()); - int stride = cl_image.numel() / 20; - stride = stride > 0 ? 
stride : 1; - - for (int i = 0; i < cl_image.numel(); i++) { - printf("%f \n", tensor_data[i]); - } - - delete[](tensor_data); - delete[](image_data); -} - -void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context, - cl_command_queue commandQueue, cl_kernel kernel) { - tensor->mutable_data(); - const auto &dim = cl_image->dims(); - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < dim.size(); ++j) { - new_dims[4 - dim.size() + j] = dim[j]; - } - size_t C, in_height, in_width; - - C = new_dims[1]; - in_height = new_dims[2]; - in_width = new_dims[3]; - - CLTensor out_cl_tensor(context, commandQueue); - out_cl_tensor.Resize(tensor->dims()); - cl_mem outBuffer = out_cl_tensor.mutable_data(); - - auto input_image = cl_image->GetCLImage(); - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(int), &in_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &in_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); - CL_CHECK_ERRORS(status); - int size_ch = in_height * in_width; - int size_block = size_ch * 4; - int size_batch = size_ch * C; - status = clSetKernelArg(kernel, 4, sizeof(int), &size_ch); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &size_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &size_batch); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &C); - CL_CHECK_ERRORS(status); - size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3], - new_dims[0] * new_dims[2]}; - status = clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL, - global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - memcpy(tensor->data(), out_cl_tensor.Data(), - tensor->memory_size()); -} - -void TensorToCLImage(Tensor *tensor, CLImage *cl_image, cl_context context, - 
cl_command_queue commandQueue, cl_kernel kernel) { - const auto &dim = cl_image->dims(); - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < dim.size(); ++j) { - new_dims[4 - dim.size() + j] = dim[j]; - } - cl_int status; - auto output = cl_image; - const Tensor *input = tensor; - const float *input_data = input->data(); - auto output_image = output->GetCLImage(); - const int out_C = new_dims[1]; - const int out_H = new_dims[2]; - const int out_W = new_dims[3]; - const int Stride2 = out_C * out_H * out_W; - const int Stride1 = out_H * out_W; - const int Stride0 = out_W; - DLOG << out_C; - DLOG << out_H; - DLOG << out_W; - CLTensor input_cl_tensor(context, commandQueue); - input_cl_tensor.Resize(input->dims()); - cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input_data); - - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); - CL_CHECK_ERRORS(status); - - size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3], - new_dims[0] * new_dims[2]}; - status = clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL, - global_work_size, NULL, 0, NULL, NULL); - - CL_CHECK_ERRORS(status); -} - -#ifdef PADDLE_MOBILE_DEBUG -Print &operator<<(Print &printer, const CLImage &cl_image) { - size_t width = cl_image.ImageDims()[0]; - size_t height = cl_image.ImageDims()[1]; - - half_t *image_data = new half_t[height * width * 
4]; - cl_int err; - cl_mem image = cl_image.GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, - region, 0, 0, image_data, 0, NULL, NULL); - - CL_CHECK_ERRORS(err); - - PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0, - "cl_image numel should not be 0 "); - float *tensor_data = new float[cl_image.numel()]; - auto converter = cl_image.Converter(); - converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(), - cl_image.dims()); - int stride = cl_image.numel() / 20; - stride = stride > 0 ? stride : 1; - - printer << " dims: " << cl_image.dims() << "\n"; - for (int i = 0; i < cl_image.numel(); i += stride) { - printer << tensor_data[i] << " "; - } - - delete[](tensor_data); - delete[](image_data); - - return printer; -} -#endif -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h deleted file mode 100644 index 57656c3c6d..0000000000 --- a/mobile/src/framework/cl/cl_image.h +++ /dev/null @@ -1,338 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "CL/cl.h" - -#include "framework/cl/cl_deleter.h" -#include "framework/cl/cl_engine.h" -#include "framework/cl/cl_half.h" -#include "framework/cl/cl_image_converter.h" -#include "framework/cl/cl_tool.h" -#include "framework/ddim.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace framework { - -class CLImage { - public: - CLImage() = default; - - ~CLImage() { - if (tensor_data_ != nullptr) { - delete[](tensor_data_); - } - - if (image_converter_) { - delete (image_converter_); - } - } - /* - * will not hold input tensor data, memcpy in this method - * */ - void SetTensorData(float *tensorData, const DDim &dim) { - int numel = product(dim); - if (tensor_data_ != nullptr) { - delete[](tensor_data_); - tensor_data_ = nullptr; - } - tensor_data_ = new float[numel]; - memcpy(tensor_data_, tensorData, numel * sizeof(float)); - tensor_dims_ = dim; - } - - bool isInit() { return initialized_; } - /* - * need call SetTensorData first - * - * folder when one dim or two dim - * */ - void InitCLImage(cl_context context, cl_command_queue command_queue) { - PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, - " need call SetTensorData first"); - CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); - InitCLImage(context, command_queue, folder_converter); - } - - void InitNormalCLImage(cl_context context, cl_command_queue command_queue) { - PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, - " need call SetTensorData first"); - CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); - InitCLImage(context, command_queue, normal_converter); - } - - void InitCLImage(cl_context context, cl_command_queue command_queue, - CLImageConverterBase *converter) { - if (image_converter_ != nullptr) { - delete (image_converter_); - } - - PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, - " need call SetTensorData first"); - - LOG(kNO_LOG) << " begin init cl image "; - image_dims_ = 
converter->InitImageDimInfoWith(tensor_dims_); - - half_t *image_data = new half_t[product(image_dims_) * 4]; - - LOG(kNO_LOG) << " convert to image"; - converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); - LOG(kNO_LOG) << " end convert to image"; - - InitCLImage(context, image_dims_[0], image_dims_[1], image_data); - - delete[](image_data); - delete[](tensor_data_); - - command_queue_ = command_queue; - tensor_data_ = nullptr; - image_converter_ = converter; - initialized_ = true; - LOG(kNO_LOG) << " end init cl image"; - } - - void InitNImage(cl_context context, cl_command_queue command_queue) { - if (tensor_data_ == nullptr) { - PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); - } - CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock(); - InitCLImage(context, command_queue, folder_converter); - PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); - } - void InitDWImage(cl_context context, cl_command_queue command_queue) { - if (tensor_data_ == nullptr) { - PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); - } - CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock(); - InitCLImage(context, command_queue, dw_converter); - PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); - } - - void InitEmptyImage(cl_context context, cl_command_queue command_queue, - const DDim &dim) { - if (image_converter_ != nullptr) { - delete image_converter_; - } - PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, - " empty image tensor data shouldn't have value"); - - // CLImageConverterFolder *folder_converter = new - // CLImageConverterFolder(); - CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); - PADDLE_MOBILE_ENFORCE(!shared_mem_, "do not init mem after shared .") - // LOG(kNO_LOG) << " to get image dims "; - image_dims_ = normal_converter->InitImageDimInfoWith(dim); - // LOG(kNO_LOG) << " end get image dims " << image_dims_; - - 
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); - - tensor_dims_ = dim; - command_queue_ = command_queue; - image_converter_ = normal_converter; - cl_event_ = CLEngine::Instance()->CreateEvent(context); - initialized_ = true; - // LOG(kNO_LOG) << " end init cl image"; - } - /** - * create fake size cl_mem for mem share - */ - void InitFakeSizeImage(cl_context context, cl_command_queue command_queue, - const DDim &need_dims, const DDim &real_image_dims) { - PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, - " empty image tensor data shouldn't have value"); - if (image_converter_ != nullptr) { - delete image_converter_; - } - CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); - // use real image dims to create mem - real_image_dims_ = real_image_dims; - // when init fake size image , - // reinit image is allow , it is disallowed after this.. - shared_mem_ = false; - InitCLImage(context, real_image_dims_[0], real_image_dims_[1], nullptr); - // cheat cl_image they got what they wanted - image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - LOG(kNO_LOG) << "InitFakeSizeImage ... 
"; - LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; - LOG(kNO_LOG) << "image_dims_: " << image_dims_; - PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && - real_image_dims_[1] >= image_dims_[1], - "real image is not enough"); - tensor_dims_ = need_dims; - command_queue_ = command_queue; - image_converter_ = normal_converter; - cl_event_ = CLEngine::Instance()->CreateEvent(context); - initialized_ = true; - shared_mem_ = true; - - LOG(kNO_LOG) << " end init FakeSizeImage"; - } - /** - * init cl mem with a exist cl mem - */ - void InitWithExistMem(cl_context context, cl_command_queue command_queue, - DDim need_dims, const CLImage &src) { - if (image_converter_ != nullptr) { - delete image_converter_; - } - CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); - - real_image_dims_ = src.real_image_dims_; - image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - - LOG(kNO_LOG) << "InitWithExistMem ... "; - LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; - LOG(kNO_LOG) << "image_dims_: " << image_dims_; - - if (real_image_dims_[0] < image_dims_[0] || - real_image_dims_[1] < image_dims_[1]) { - LOG(kNO_LOG) << "real image is not enough!"; - LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; - LOG(kNO_LOG) << "image_dims_: " << image_dims_; - } - PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && - real_image_dims_[1] >= image_dims_[1], - "real image is not enough!"); - if (cl_image_ != src.cl_image_) { - cl_image_ = src.cl_image_; - } - - tensor_dims_ = need_dims; - command_queue_ = command_queue; - image_converter_ = normal_converter; - cl_event_ = CLEngine::Instance()->CreateEvent(context); - initialized_ = true; - shared_mem_ = true; - - LOG(kNO_LOG) << " end init WithExistMem"; - } - - void InitConv2dTransposeFilterCLImage(cl_context context, - cl_command_queue command_queue) { - PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, - " need call SetTensorData first"); - 
CLImageConverterConv2dTransposeTransWeight *converter = - new CLImageConverterConv2dTransposeTransWeight(); - InitCLImage(context, command_queue, converter); - } - - cl_mem GetCLImage() const { return cl_image_.get(); } - - const DDim &ImageDims() const { return image_dims_; } - - inline size_t ImageWidth() const { return image_dims_[0]; } - - inline size_t ImageHeight() const { return image_dims_[1]; } - - inline cl_command_queue CommandQueue() const { return command_queue_; } - - /* - * resize original tensor dim - * */ - inline CLImage &Resize(const DDim &dims) { - tensor_dims_ = dims; - return *this; - } - - template - T *data() const { - if (initialized_) { - PADDLE_MOBILE_THROW_EXCEPTION( - " cl image has initialized, tensor data has been deleted, can't use " - "tensor data"); - } - return reinterpret_cast(tensor_data_); - } - - /* - * numel of tensor dim - * */ - inline int64_t numel() const { return product(tensor_dims_); } - - /* - * original tensor dim - * */ - const DDim &dims() const { return tensor_dims_; } - - cl_event GetClEvent() const { return cl_event_.get(); } - - CLImageConverterBase *Converter() const { return image_converter_; } - void PrintTensor(const CLImage &cl_image) const; - - private: - void InitCLImage(cl_context context, size_t width, size_t height, - void *data) { - PADDLE_MOBILE_ENFORCE(!shared_mem_, "do not init mem after shared .") - - cl_image_format cf = {.image_channel_order = CL_RGBA, - .image_channel_data_type = CL_HALF_FLOAT}; - cl_image_desc cid = { - .image_type = CL_MEM_OBJECT_IMAGE2D, - .image_width = width, - .image_height = height, - .image_depth = 1, - .image_array_size = 1, - .image_row_pitch = 0, - .image_slice_pitch = 0, - .num_mip_levels = 0, - .num_samples = 0, - // .buffer = nullptr - }; - cid.buffer = nullptr; - cl_int err; - cl_mem cl_image = clCreateImage( - context, CL_MEM_READ_WRITE | (data ? 
CL_MEM_COPY_HOST_PTR : 0), - &cf, // const cl_image_format *image_format - &cid, // const cl_image_desc *image_desc - data, // void *host_ptr - &err); - cl_image_.reset(cl_image, CLMemDeleter()); - if (err != CL_SUCCESS) { - CL_CHECK_ERRORS(err); - PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); - } - } - - bool initialized_ = false; - std::shared_ptr<_cl_mem> cl_image_; - std::unique_ptr<_cl_event, CLEventDeleter> cl_event_; - DDim tensor_dims_; - DDim image_dims_; - // real image dims usually it is same as image_dims - DDim real_image_dims_; - float *tensor_data_ = nullptr; - cl_context context_; - cl_command_queue command_queue_; - CLImageConverterBase *image_converter_ = nullptr; - bool shared_mem_ = false; -}; - -void TensorToCLImage(Tensor *tensor, CLImage *image, cl_context context, - cl_command_queue commandQueue, cl_kernel kernel); - -void CLImageToTensor(CLImage *image, Tensor *tensor, cl_context context, - cl_command_queue commandQueue, cl_kernel kernel); - -#ifdef PADDLE_MOBILE_DEBUG -Print &operator<<(Print &printer, const CLImage &image); -#endif - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_image_converter.cpp b/mobile/src/framework/cl/cl_image_converter.cpp deleted file mode 100644 index 277d379152..0000000000 --- a/mobile/src/framework/cl/cl_image_converter.cpp +++ /dev/null @@ -1,510 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "framework/cl/cl_image_converter.h" - -namespace paddle_mobile { -namespace framework { - -DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - return make_ddim({width, height}); -} - -void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - DDim in_image_dim = InitImageDimInfoWith(tensor_dim); - - DLOG << " tensor dim " << tensor_dim; - DLOG << " image dim " << in_image_dim; - - size_t width = in_image_dim[0]; - size_t height = in_image_dim[1]; - - int w_block = width / W; - - float *p = nchw; - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < w_block * 4; c++) { - size_t i1 = i0 + (c / 4) * W; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - if (c < C) { - // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + - // (c % 4); - image[i2] = Float2Half(*p); - i2 += 4; - p++; - } else { - image[i2] = 0.0; - i2 += 4; - } - } - i1 += width; - } - } - i0 += width * H; - } -} - -void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - int width = image_dim[0]; - int 
height = image_dim[0]; - - float *p = tensor; - - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0 + (c / 4) * W; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - *p = Half2Float(image[i2]); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } -} - -DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { - if (tensor_dim.size() <= 2) { - int tdim[2] = {1, 1}; - if (tensor_dim.size() == 1) { - tdim[1] = tensor_dim[0]; - } else { - tdim[0] = tensor_dim[0]; - tdim[1] = tensor_dim[1]; - } - int width = (tdim[1] + 3) / 4; - int height = tdim[0]; - - width_of_one_block_ = width; - height_of_one_block_ = height; - c_block_ = 1; - - return make_ddim({width, height}); - - } else { - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - width_of_one_block_ = W; - height_of_one_block_ = H; - c_block_ = width / W; - - return make_ddim({width, height}); - } -} - -void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, - const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, - "tensor dim is not support "); - - if (tensor_dim.size() > 2) { - CLImageConverterDefault default_converter; - default_converter.NCHWToImage(tensor, image, tensor_dim); - - } else { - int tdim[2] = {1, 1}; - if (tensor_dim.size() == 1) { - tdim[1] = tensor_dim[0]; - } else { - tdim[0] = tensor_dim[0]; - tdim[1] = tensor_dim[1]; - } - - DDim image_dim = InitImageDimInfoWith(tensor_dim); - int width = image_dim[0]; - - for (int h = 0; h < tdim[0]; h++) { - for (int w = 0; w < tdim[1]; w++) { - image[(h * width + w / 4) * 4 + (w % 4)] = - Float2Half(tensor[h * tdim[1] + w]); - 
} - } - } -} - -void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - if (tensor_dim.size() > 2) { - CLImageConverterDefault default_converter; - default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); - - } else { - int width = image_dim[0]; - int height = image_dim[1]; - int H, W; - - if (tensor_dim.size() == 2) { - H = tensor_dim[0]; - W = tensor_dim[1]; - } else if (tensor_dim.size() == 1) { - H = 1; - W = tensor_dim[0]; - } - float *p = tensor; - - for (int h = 0; h < H; h++) { - for (int w = 0; w < W; w++) { - p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); - } - } - } -} - -DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); - size_t N, C, H, W; - N = tensor_dim[0]; - C = tensor_dim[1]; - H = tensor_dim[2]; - W = tensor_dim[3]; - size_t width = W * ((N + 3) / 4); - size_t height = C * H; - return make_ddim({width, height}); -} - -void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, - const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); - auto image_dim = InitImageDimInfoWith(tensor_dim); - float *p = tensor; - int N = tensor_dim[0]; - int C = tensor_dim[1]; - int H = tensor_dim[2]; - int W = tensor_dim[3]; - int width = image_dim[0]; - int height = image_dim[1]; - int block = image_dim[0] / tensor_dim[3]; - - for (int n = 0; n < block * 4; n++) { - for (int c = 0; c < C; c++) { - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + - w * 4 + n % 4; - if (n < N) { - image[index] = Float2Half(*p); - p++; - } else { - image[index] = 0.0; - } - if (index >= (width * height * 4)) { - DLOG << " index out of range "; - } - } - } - } - } - DLOG << " init done"; -} - -void CLImageConverterNWBlock::ImageToNCHW(half_t *image, 
float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); - float *p = tensor; - int N = tensor_dim[0]; - int C = tensor_dim[1]; - int H = tensor_dim[2]; - int W = tensor_dim[3]; - int width = image_dim[0]; - int height = image_dim[1]; - int block = image_dim[0] / tensor_dim[3]; - - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + - w * 4 + n % 4; - *p = Half2Float(image[index]); - p++; - if (index >= (width * height * 4)) { - DLOG << " index out of range "; - } - } - } - } - } - DLOG << " init done"; -} - -DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); - size_t N, C, H, W; - N = tensor_dim[0]; - C = tensor_dim[1]; - H = tensor_dim[2]; - W = tensor_dim[3]; - size_t width = W * ((N + 3) / 4); - size_t height = C * H; - return make_ddim({width, height}); -} - -void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[1]; - C = new_dims[0]; - H = new_dims[2]; - W = new_dims[3]; - - DDim in_image_dim = InitImageDimInfoWith(tensor_dim); - - DLOG << " tensor dim " << tensor_dim; - DLOG << " image dim " << in_image_dim; - - size_t width = in_image_dim[0]; - size_t height = in_image_dim[1]; - - int w_block = width / W; - - float *p = tensor; - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < w_block * 4; c++) { - size_t i1 = i0 + (c / 4) * W; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - if (c < C) { - // int x = (n * width * H + h * width + (c / 4) * W + 
w) * 4 + - // (c % 4); - image[i2] = Float2Half(*p); - i2 += 4; - p++; - } else { - image[i2] = 0.0; - i2 += 4; - } - } - i1 += width; - } - } - i0 += width * H; - } -} - -void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); - float *p = tensor; - int N = tensor_dim[1]; - int C = tensor_dim[0]; - int H = tensor_dim[2]; - int W = tensor_dim[3]; - int width = image_dim[0]; - int height = image_dim[0]; - - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0 + (c / 4) * W; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - *p = Half2Float(image[i2]); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } -} - -DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, - "tensor dim is not support "); - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - width_of_one_block_ = W; - height_of_one_block_ = H; - c_block_ = width / W; - - return make_ddim({width, height}); -} - -void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, - const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, - "tensor dim is not support "); - - CLImageConverterDefault default_converter; - default_converter.NCHWToImage(tensor, image, tensor_dim); -} - -void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - CLImageConverterDefault default_converter; - default_converter.ImageToNCHW(image, 
tensor, image_dim, tensor_dim); -} - -DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( - const DDim &tensor_dim) { - PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); - size_t N, C, H, W; - N = tensor_dim[0]; - C = tensor_dim[1]; - H = tensor_dim[2]; - W = tensor_dim[3]; - size_t width = (C + 3) / 4; - size_t height = N * 16; // N * (wino_blk_size + 2) * (wino_blk_size + 2) - return make_ddim({width, height}); -} - -void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image, - const DDim &tensor_dim) {} - -void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) {} - -DDim CLImageConverterConv2dTransposeTransWeight::InitImageDimInfoWith( - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - C = new_dims[0]; - N = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - return make_ddim({width, height}); -} - -// it is actually CNHW to Image, because conv2d_transpose's filter is CNHW -void CLImageConverterConv2dTransposeTransWeight::NCHWToImage( - float *nchw, half_t *image, const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (int j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - C = new_dims[0]; - N = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - DDim in_image_dim = InitImageDimInfoWith(tensor_dim); - - DLOG << " tensor dim " << tensor_dim; - DLOG << " image dim " << in_image_dim; - - size_t width = in_image_dim[0]; - size_t height = in_image_dim[1]; - - int w_block = width / W; - - float *p = nchw; - int realC = w_block * 4; - for (int c = 0; c < realC; c++) { - for (int n = 0; n < N; n++) { - for (int h = 0; h < H; h++) { - for (int w = 0; w < W; w++) 
{ - int index = (n * H + h) * width * 4 + (c / 4) * 4 * W + w * 4 + c % 4; - if (c < C) { - image[index] = Float2Half(*p); - p++; - } else { - image[index] = 0; - } - } - } - } - } -} - -void CLImageConverterConv2dTransposeTransWeight::ImageToNCHW( - half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim) {} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_image_converter.h b/mobile/src/framework/cl/cl_image_converter.h deleted file mode 100644 index 75c135c042..0000000000 --- a/mobile/src/framework/cl/cl_image_converter.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/cl/cl_half.h" -#include "framework/ddim.h" - -namespace paddle_mobile { -namespace framework { - -class CLImageConverterBase { - public: - virtual void NCHWToImage(float *nchw, half_t *image, - const DDim &tensor_dim) = 0; - - virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, - const DDim &tensor_dim) = 0; - virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0; -}; - -class CLImageConverterDefault : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim); -}; - -class CLImageConverterFolder : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim); - - /* - * width of original tensor - * */ - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - /* - * height of original tensor - * */ - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } - - int GetCBlock() const { return c_block_; } - - private: - int c_block_; - int width_of_one_block_; - int height_of_one_block_; -}; - -class CLImageConverterNormal : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim); - - /* - * width of original tensor - * */ - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - /* - * height of original tensor - * */ - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } - - int GetCBlock() const { return c_block_; } - - 
private: - int c_block_; - int width_of_one_block_; - int height_of_one_block_; -}; - -class CLImageConverterNWBlock : public CLImageConverterBase { - DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim); -}; -class CLImageConverterDWBlock : public CLImageConverterBase { - DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim); -}; - -class CLImageConverterWinoTransWeight : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim); -}; - -class CLImageConverterConv2dTransposeTransWeight : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim); - void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); - void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, - const DDim &tensor_dim); -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_scope.h b/mobile/src/framework/cl/cl_scope.h deleted file mode 100644 index 49e705e5a0..0000000000 --- a/mobile/src/framework/cl/cl_scope.h +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "CL/cl.h" -#include "framework/cl/cl_deleter.h" -#include "framework/cl/cl_engine.h" -#include "framework/cl/cl_tool.h" - -namespace paddle_mobile { - -extern const std::map> opencl_kernels; -extern const std::map> opencl_headers; - -namespace framework { - -class CLScope { - public: - CLScope() {} - - cl_command_queue CommandQueue() { - return CLEngine::Instance()->getClCommandQueue(); - } - - std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel( - const std::string &kernel_name, const std::string &file_name, - const std::string &options) { - LOG(kLOG_DEBUG2) << " to get program " << file_name; - auto program = Program(file_name, kernel_name, options); - LOG(kLOG_DEBUG2) << " end get program ~ "; - LOG(kLOG_DEBUG2) << " to create kernel: " << kernel_name; - std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( - clCreateKernel(program, kernel_name.c_str(), &status_)); - CL_CHECK_ERRORS(status_); - LOG(kLOG_DEBUG2) << " end create kernel ~ "; - return std::move(kernel); - } - - cl_context Context() { return CLEngine::Instance()->getContext(); } - - cl_program Program(const std::string &file_name, - const std::string &kernel_name, - const std::string &options) { - if (opencl_kernels.find(kernel_name) != opencl_kernels.end() && - opencl_headers.find(file_name) != opencl_headers.end()) { - std::string program_key = file_name + kernel_name; - if (!options.empty()) { - program_key += options; - } - auto it = programs_.find(program_key); - if (it != programs_.end()) { - return 
it->second.get(); - } - auto src_it = opencl_kernels.find(kernel_name); - std::string source(src_it->second.begin(), src_it->second.end()); - auto header_it = opencl_headers.find(file_name); - std::string header(header_it->second.begin(), header_it->second.end()); - source = header + "\n" + source; - auto program = CLEngine::Instance()->CreateProgramWithSource( - CLEngine::Instance()->getContext(), source.c_str()); - - LOG(kLOG_DEBUG3) << " --- begin build program -> " << program_key - << " --- "; - CLEngine::Instance()->BuildProgram(program.get(), options); - LOG(kLOG_DEBUG3) << " --- end build program -> " << program_key - << " --- "; - - programs_[program_key] = std::move(program); - return programs_[program_key].get(); - } else { - std::string program_key = file_name; - if (!options.empty()) { - program_key += options; - } - auto it = programs_.find(program_key); - if (it != programs_.end()) { - return it->second.get(); - } - auto program = CLEngine::Instance()->CreateProgramWith( - CLEngine::Instance()->getContext(), - CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name); - - LOG(kLOG_DEBUG3) << " --- begin build program ele-> " << program_key - << " --- "; - CLEngine::Instance()->BuildProgram(program.get(), options); - LOG(kLOG_DEBUG3) << " --- end build program ele-> " << program_key - << " --- "; - - programs_[program_key] = std::move(program); - return programs_[program_key].get(); - } - } - - CLLocalWorkSizeInfo LocalWorkSizeInfo() { - return CLEngine::Instance()->getLocalWorkSizeInfo(); - } - size_t KernelWorkSize(cl_kernel kernel) { - size_t kernel_work_size = CLEngine::Instance()->GetKernelWorkSize(kernel); - return kernel_work_size; - } - - private: - cl_int status_; - std::unordered_map> - programs_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_tensor.h b/mobile/src/framework/cl/cl_tensor.h deleted file mode 100644 index 5bb4055eff..0000000000 --- a/mobile/src/framework/cl/cl_tensor.h 
+++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "CL/cl.h" -#include "framework/cl/cl_deleter.h" -#include "framework/cl/cl_engine.h" -#include "framework/tensor_base.h" - -namespace paddle_mobile { -namespace framework { - -class CLTensor : public TensorBase { - public: - CLTensor(cl_context context, cl_command_queue command_queue) - : context_(context), command_queue_(command_queue) {} - - CLTensor() = default; - - /* - * if init method haven't set context and command_queue, need set - * */ - void SetContextAndCommandQueue(cl_context context, - cl_command_queue command_queue) { - context_ = context; - command_queue_ = command_queue; - } - - /*! Resize the dimensions of the memory block. 
*/ - inline CLTensor &Resize(const DDim &dims) { - dims_ = dims; - return *this; - } - - template - inline cl_mem mutable_with_data(const T *data) { - int64_t size = numel() * sizeof(T); - - holder_.reset(new PlaceholderImpl( - size, reinterpret_cast(const_cast(data)), - type_id().hash_code(), context_, command_queue_)); - return reinterpret_cast(holder_->ptr()); - } - - inline cl_mem mutable_data(kTypeId_t type) { - if (holder_ != nullptr) { - holder_->set_type(type); - } - PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") - int64_t size = numel() * SizeOfType(type); - if (holder_ == nullptr || holder_->size() < size + offset_) { - holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_)); - offset_ = 0; - } - return reinterpret_cast(holder_->ptr()); - } - - /** - * @brief Return a pointer to cl buffer. - * @note If not exist, then allocation. - */ - template - inline cl_mem mutable_data() { - return reinterpret_cast(mutable_data(type_id().hash_code())); - } - - /** - * @brief Return a pointer to cl buffer. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * - * @note If not exist, then allocation. 
- */ - template - inline cl_mem mutable_data(DDim dims) { - Resize(dims); - return mutable_data(); - } - - inline cl_mem CLBuffer() { - check_memory_size(); - return reinterpret_cast( - reinterpret_cast(holder_->ptr())); - } - - template - inline T *Data() { - if (host_ptr_) { - delete (host_ptr_); - host_ptr_ = nullptr; - } - cl_mem buffer = CLBuffer(); - host_ptr_ = new char[holder_->size()]; - cl_int status; - status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0, - holder_->size(), host_ptr_, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - return reinterpret_cast(host_ptr_); - } - - int memorySize() { return holder_->size(); } - - ~CLTensor() { - DLOG << "~CLTensor"; - if (host_ptr_) { - DLOG << " delete host ptr "; - delete (host_ptr_); - host_ptr_ = nullptr; - } - } - - private: - cl_context context_; - cl_command_queue command_queue_; - void *host_ptr_ = nullptr; - - struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(size_t size, void *input, kTypeId_t type, - cl_context context, cl_command_queue command_queue) - : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, - size, reinterpret_cast(input), NULL)), - size_(size), - capatity_(size), - type_(type), - context_(context), - command_queue_(command_queue) {} - - PlaceholderImpl(size_t size, kTypeId_t type, cl_context context, - cl_command_queue command_queue) - : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)), - size_(size), - capatity_(size), - type_(type), - context_(context), - command_queue_(command_queue) {} - - virtual size_t size() const { return size_; } - - virtual void *ptr() const { return static_cast(ptr_.get()); } - - virtual kTypeId_t type() const { return type_; } - - virtual void set_type(kTypeId_t type) { type_ = type; } - - virtual void resize(size_t size) { - if (size > capatity_) { - capatity_ = size; - ptr_.reset( - clCreateBuffer(context_, CL_MEM_READ_WRITE, capatity_, NULL, NULL)); - } - size_ = size; - } - - virtual void 
realloc(size_t size) { - capatity_ = size; - ptr_.reset( - clCreateBuffer(context_, CL_MEM_READ_WRITE, capatity_, NULL, NULL)); - size_ = size; - } - - std::unique_ptr<_cl_mem, CLMemDeleter> ptr_; - - size_t size_; - - size_t capatity_; - - /* the current type of memory */ - kTypeId_t type_; - - cl_context context_; - cl_command_queue command_queue_; - }; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_tool.cpp b/mobile/src/framework/cl/cl_tool.cpp deleted file mode 100644 index 827642b6b7..0000000000 --- a/mobile/src/framework/cl/cl_tool.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/cl/cl_tool.h" - -namespace paddle_mobile { -namespace framework { - -const char *opencl_error_to_str(cl_int error) { -#define CASE_CL_CONSTANT(NAME) \ - case NAME: \ - return #NAME; - // Suppose that no combinations are possible. 
- switch (error) { - CASE_CL_CONSTANT(CL_SUCCESS) - CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) - CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) - CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) - CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) - CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) - CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) - CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) - CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) - CASE_CL_CONSTANT(CL_MAP_FAILURE) - CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) - CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) - CASE_CL_CONSTANT(CL_INVALID_VALUE) - CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) - CASE_CL_CONSTANT(CL_INVALID_PLATFORM) - CASE_CL_CONSTANT(CL_INVALID_DEVICE) - CASE_CL_CONSTANT(CL_INVALID_CONTEXT) - CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) - CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) - CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) - CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) - CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) - CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) - CASE_CL_CONSTANT(CL_INVALID_SAMPLER) - CASE_CL_CONSTANT(CL_INVALID_BINARY) - CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) - CASE_CL_CONSTANT(CL_INVALID_PROGRAM) - CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) - CASE_CL_CONSTANT(CL_INVALID_KERNEL) - CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) - CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) - CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) - CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) - CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) - CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) - CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) - CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) - CASE_CL_CONSTANT(CL_INVALID_EVENT) - CASE_CL_CONSTANT(CL_INVALID_OPERATION) - 
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) - CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) - CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) - CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) - CASE_CL_CONSTANT(CL_INVALID_PROPERTY) - - default: - return "UNKNOWN ERROR CODE"; - } -#undef CASE_CL_CONSTANT -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_tool.h b/mobile/src/framework/cl/cl_tool.h deleted file mode 100644 index ccc97779ec..0000000000 --- a/mobile/src/framework/cl/cl_tool.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "CL/cl.h" - -namespace paddle_mobile { -namespace framework { - -const char* opencl_error_to_str(cl_int error); - -#define CL_CHECK_ERRORS(ERR) \ - if (ERR != CL_SUCCESS) { \ - printf( \ - "\033[1;31;40mOpenCL error with code %s happened in file %s at line " \ - "%d. " \ - "Exiting.\033[0m\n", \ - paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ - __LINE__); \ - } - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/context.cpp b/mobile/src/framework/context.cpp deleted file mode 100644 index 10f1572d03..0000000000 --- a/mobile/src/framework/context.cpp +++ /dev/null @@ -1,605 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// Tencent is pleased to support the open source community by making ncnn -// available. -// -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of the -// License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations under -// the License. 
- -#include "framework/context.h" -#include -#include -#include "common/log.h" - -#ifdef __APPLE__ -#include "TargetConditionals.h" -#ifdef TARGET_OS_IPHONE -// iOS -#elif TARGET_OS_MAC -// Mac OS -#else -// Unsupported platform -#endif -#include -#include -#include -#else // Linux or Android -#include -#include -#endif - -namespace paddle_mobile { -namespace framework { - -const int DEFAULT_L1_CACHE_SIZE = 32 * 1024; -const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; -const int DEFAULT_L3_CACHE_SIZE = 0; - -void fill_cpu_cache_size(std::vector *cpu_cache_sizes, int value, - const std::vector cpu_ids = {}) { - int num = cpu_ids.size(); - if (num > 0) { - for (int i = 0; i < num; i++) { - if (cpu_ids.size() > i) { - int idx = cpu_ids[i]; - if (cpu_cache_sizes->size() > idx) { - (*cpu_cache_sizes)[idx] = value; - } - } - } - } else { - num = cpu_cache_sizes->size(); - for (int i = 0; i < num; i++) { - if (cpu_cache_sizes->size() > i) { - (*cpu_cache_sizes)[i] = value; - } - } - } -} - -int get_cpu_num() { -#ifdef __APPLE__ - int count = 0; - size_t len = sizeof(count); - sysctlbyname("hw.ncpu", &count, &len, NULL, 0); - if (count < 1) { - count = 1; - } - return count; -#else // Linux or Android - // get cpu num from /sys/devices/system/cpu/cpunum/uevent - int max_cpu_num = 20; - int count = 0; - for (int i = 0; i < max_cpu_num; i++) { - char path[256]; - snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); - FILE *fp = fopen(path, "rb"); - if (!fp) { - break; - } - count++; - fclose(fp); - } - if (count < 1) { - count = 1; - } - return count; -#endif -} - -#if !defined(__APPLE__) // Linux or Android -std::string get_cpu_name() { - FILE *fp = fopen("/proc/cpuinfo", "rb"); - if (!fp) { - return ""; - } - char line[1024]; - while (!feof(fp)) { - char *s = fgets(line, 1024, fp); - if (!s) { - break; - } - if (strstr(line, "Hardware") != NULL) { - fclose(fp); - return std::string(line); - } - } - fclose(fp); - return ""; -} - -int 
get_cpu_max_freq_khz(int cpu_id) { - // first try, for all possible cpu - char path[256]; -#ifdef __ANDROID__ - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id); - FILE *fp = fopen(path, "rb"); - if (!fp) { - // second try, for online cpu - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", - cpu_id); - fp = fopen(path, "rb"); - if (!fp) { - // third try, for online cpu - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", - cpu_id); - fp = fopen(path, "rb"); - if (!fp) { - return 0; - } - int max_freq_khz = 0; - if (fscanf(fp, "%d", &max_freq_khz) <= 0) { - max_freq_khz = 0; - } - fclose(fp); - return max_freq_khz; - } - } - int max_freq_khz = 0; - while (!feof(fp)) { - int freq_khz = 0; - int nscan = fscanf(fp, "%d %*d", &freq_khz); - if (nscan != 1) { - break; - } - if (freq_khz > max_freq_khz) { - max_freq_khz = freq_khz; - } - } - fclose(fp); - return max_freq_khz; -#else - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_max_freq", cpu_id); - FILE *fp = fopen(path, "r"); - if (!fp) { - return 0; - } - int max_freq_khz = 0; - if (fscanf(fp, "%d", &max_freq_khz) <= 0) { - max_freq_khz = 0; - } - fclose(fp); - return max_freq_khz; -#endif -} - -void get_cpu_cache_size(int cpu_id, int *l1_cache_size, int *l2_cache_size, - int *l3_cache_size) { - int max_cache_idx_num = 10; - *l1_cache_size = DEFAULT_L1_CACHE_SIZE; - *l2_cache_size = DEFAULT_L2_CACHE_SIZE; - *l3_cache_size = DEFAULT_L3_CACHE_SIZE; - for (int i = 0; i < max_cache_idx_num; i++) { - char path[256]; - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i); - FILE *fp = fopen(path, "rb"); - if (fp) { - int level = -1; - fscanf(fp, "%d", &level); - fclose(fp); - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i); - fp = fopen(path, "rb"); - if (fp) { - int size = -1; 
- fscanf(fp, "%d", &size); - fclose(fp); - if (size >= 0) { - if (level == 1) { - *l1_cache_size = size * 1024; - } else if (level == 2) { - *l2_cache_size = size * 1024; - } else if (level == 3) { - *l3_cache_size = size * 1024; - } - } - } - } - } -} - -int check_online(std::vector *cpu_ids) { - if (cpu_ids->size() == 0) { - return 0; - } - std::vector online_cpu_ids; - char path[256]; - for (int i = 0; i < cpu_ids->size(); i++) { - int cpu_id = (*cpu_ids)[i]; - snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", - cpu_id); - FILE *fp = fopen(path, "rb"); - if (fp) { - int is_online = 0; - fscanf(fp, "%d", &is_online); - fclose(fp); - if (is_online != 0) { - online_cpu_ids.push_back(cpu_id); - } - } - // open failed(Permission denied) - } - *cpu_ids = online_cpu_ids; - return cpu_ids->size(); -} - -int set_sched_affinity(const std::vector &cpu_ids) { -// cpu_set_t definition -// ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity -#define CPU_SETSIZE 1024 -#define __NCPUBITS (8 * sizeof(unsigned long)) // NOLINT - typedef struct { - unsigned long __bits[CPU_SETSIZE / __NCPUBITS]; // NOLINT - } cpu_set_t; - -#define CPU_SET(cpu, cpusetp) \ - ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) - -#define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t)) - - // set affinity for thread -#ifdef __GLIBC__ - pid_t pid = syscall(SYS_gettid); -#else - pid_t pid = gettid(); -#endif - cpu_set_t mask; - CPU_ZERO(&mask); - for (int i = 0; i < cpu_ids.size(); i++) { - CPU_SET(cpu_ids[i], &mask); - } - int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); - if (syscallret) { - LOG(kLOG_WARNING) << "invoke syscall(__NR_sched_setaffinity) error(ret=" - << syscallret << ")"; - return -1; - } - return 0; -} - -int get_cpu_info_by_name(int *cpu_num, ARMArch *arch, - std::vector *big_core_ids, - std::vector *little_core_ids, - std::vector *l1_cache_sizes, - std::vector *l2_cache_sizes, - 
std::vector *l3_cache_sizes, - std::string hardware_name) { - /* Snapdragon */ - if (hardware_name.find("SDM845") != std::string::npos) { // 845 - *cpu_num = 8; - *arch = A75; - *big_core_ids = {4, 5, 6, 7}; - *little_core_ids = {0, 1, 2, 3}; - l1_cache_sizes->resize(*cpu_num); - l2_cache_sizes->resize(*cpu_num); - l3_cache_sizes->resize(*cpu_num); - fill_cpu_cache_size(l1_cache_sizes, 64 * 1024); - fill_cpu_cache_size(l2_cache_sizes, 256 * 1024, *big_core_ids); - fill_cpu_cache_size(l2_cache_sizes, 128 * 1024, *little_core_ids); - fill_cpu_cache_size(l3_cache_sizes, 2048 * 1024); - return 0; - } else if (hardware_name.find("SDM710") != std::string::npos) { // 710 - *cpu_num = 8; - *arch = A75; - *big_core_ids = {6, 7}; - *little_core_ids = {0, 1, 2, 3, 4, 5}; - l1_cache_sizes->resize(*cpu_num); - l2_cache_sizes->resize(*cpu_num); - l3_cache_sizes->resize(*cpu_num); - fill_cpu_cache_size(l1_cache_sizes, 64 * 1024, *big_core_ids); - fill_cpu_cache_size(l1_cache_sizes, 32 * 1024, *little_core_ids); - fill_cpu_cache_size(l2_cache_sizes, 256 * 1024, *big_core_ids); - fill_cpu_cache_size(l2_cache_sizes, 128 * 1024, *little_core_ids); - fill_cpu_cache_size(l3_cache_sizes, 1024 * 1024); - return 0; - } else if (hardware_name.find("MSM8998") != std::string::npos) { // 835 - *cpu_num = 8; - *arch = A73; - *big_core_ids = {4, 5, 6, 7}; - *little_core_ids = {0, 1, 2, 3}; - l1_cache_sizes->resize(*cpu_num); - l2_cache_sizes->resize(*cpu_num); - l3_cache_sizes->resize(*cpu_num); - fill_cpu_cache_size(l1_cache_sizes, 64 * 1024, *big_core_ids); - fill_cpu_cache_size(l1_cache_sizes, 32 * 1024, *little_core_ids); - // real L2 cache size is 2M, while that will get bad performace on conv3x3s1 - // or gemm, set to 1M or 512K - // fill_cpu_cache_size(l2_cache_sizes, 2048 *1024, - // *big_core_ids); - // fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024, - // *little_core_ids); - fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024); - fill_cpu_cache_size(l3_cache_sizes, 0); - return 0; - } 
else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653 - *cpu_num = 8; - *arch = A72; - *big_core_ids = {4, 5, 6, 7}; - *little_core_ids = {0, 1, 2, 3}; - l1_cache_sizes->resize(*cpu_num); - l2_cache_sizes->resize(*cpu_num); - l3_cache_sizes->resize(*cpu_num); - fill_cpu_cache_size(l1_cache_sizes, 32 * 1024); - fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024); - fill_cpu_cache_size(l3_cache_sizes, 0); - return 0; - } else if (hardware_name.find("SDM660") != std::string::npos || - hardware_name.find("SDM636") != std::string::npos) { // 660, 636 - *cpu_num = 8; - *arch = A73; - *big_core_ids = {4, 5, 6, 7}; - *little_core_ids = {0, 1, 2, 3}; - l1_cache_sizes->resize(*cpu_num); - l2_cache_sizes->resize(*cpu_num); - l3_cache_sizes->resize(*cpu_num); - fill_cpu_cache_size(l1_cache_sizes, 64 * 1024); - fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024); - fill_cpu_cache_size(l3_cache_sizes, 0); - return 0; - - /* MediaTek */ - } else if (hardware_name.find("MT6799") != std::string::npos) { // X30 - *cpu_num = 10; - *arch = A73; - *big_core_ids = {8, 9}; - *little_core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; - return 0; - } else if (hardware_name.find("MT6771") != std::string::npos) { // P60 - *cpu_num = 8; - *arch = A73; - *big_core_ids = {4, 5, 6, 7}; - *little_core_ids = {0, 1, 2, 3}; - return 0; - - /* Kirin */ - } else if (hardware_name.find("KIRIN970") != - std::string::npos) { // Kirin 970 - *cpu_num = 8; - *arch = A73; - *big_core_ids = {4, 5, 6, 7}; - *little_core_ids = {0, 1, 2, 3}; - return 0; - } - return -1; -} - -// divide cpu cores into big and little clusters by max frequency -void get_cpu_info_by_probe(int cpu_num, std::vector *big_core_ids, - std::vector *little_core_ids, - std::vector *l1_cache_sizes, - std::vector *l2_cache_sizes, - std::vector *l3_cache_sizes) { - // get maxium & minium of cpu_max_freqs - std::vector cpu_max_freqs(cpu_num); - for (int i = 0; i < cpu_num; i++) { - cpu_max_freqs[i] = get_cpu_max_freq_khz(i) / 1000; - } - int 
max_cpu_max_freq = cpu_max_freqs[0]; - int min_cpu_max_freq = cpu_max_freqs[0]; - for (int i = 1; i < cpu_num; i++) { - int cur_cpu_max_freq = cpu_max_freqs[i]; - if (cur_cpu_max_freq < min_cpu_max_freq) { - min_cpu_max_freq = cur_cpu_max_freq; - } else if (cur_cpu_max_freq > max_cpu_max_freq) { - max_cpu_max_freq = cur_cpu_max_freq; - } - } - int mid_max_freq_khz = (max_cpu_max_freq + min_cpu_max_freq) / 2; - big_core_ids->clear(); - little_core_ids->clear(); - for (int i = 0; i < cpu_num; i++) { - if (cpu_max_freqs[i] >= mid_max_freq_khz) { - big_core_ids->push_back(i); - } else { - little_core_ids->push_back(i); - } - } - /* get l1, l2, l3 cache size for each core */ - l1_cache_sizes->resize(cpu_num); - l2_cache_sizes->resize(cpu_num); - l3_cache_sizes->resize(cpu_num); - for (int i = 0; i < cpu_num; i++) { - get_cpu_cache_size(i, &((*l1_cache_sizes)[i]), &((*l2_cache_sizes)[i]), - &((*l3_cache_sizes)[i])); - } -} - -void bind_threads(const std::vector &cpu_ids) { -#ifdef _OPENMP - int num_threads = omp_get_max_threads(); - std::vector ssarets; - for (int i = 0; i < num_threads; i++) { - ssarets.push_back(0); - } -#pragma omp parallel for - for (int i = 0; i < num_threads; i++) { - ssarets[i] = set_sched_affinity(cpu_ids); - } - for (int i = 0; i < num_threads; i++) { - if (ssarets[i] != 0) { - LOG(kLOG_WARNING) << "set cpu affinity failed, thread idx: " << i; - return; - } - } -#else - int ssaret = set_sched_affinity(cpu_ids); - if (ssaret != 0) { - LOG(kLOG_WARNING) << "set cpu affinity failed, thread idx: 0 "; - return; - } -#endif -} -#endif - -CPUContext::CPUContext() { - _cpu_num = get_cpu_num(); - _big_core_ids.clear(); - _little_core_ids.clear(); -#ifdef __APPLE__ - // set default L1, L2 and L3 cache sizes - _l1_cache_sizes.resize(_cpu_num); - _l2_cache_sizes.resize(_cpu_num); - _l3_cache_sizes.resize(_cpu_num); - fill_cpu_cache_size(&_l1_cache_sizes, DEFAULT_L1_CACHE_SIZE); - fill_cpu_cache_size(&_l2_cache_sizes, DEFAULT_L2_CACHE_SIZE); - 
fill_cpu_cache_size(&_l3_cache_sizes, DEFAULT_L3_CACHE_SIZE); -#else // Linux or Android - // probe cpu info, and set big&litte clusters, L1, L2 and L3 cache sizes - std::string cpu_name = get_cpu_name(); - bool failed = - get_cpu_info_by_name(&_cpu_num, &_arch, &_big_core_ids, &_little_core_ids, - &_l1_cache_sizes, &_l2_cache_sizes, &_l3_cache_sizes, - cpu_name) != 0; - if (failed) { - get_cpu_info_by_probe(_cpu_num, &_big_core_ids, &_little_core_ids, - &_l1_cache_sizes, &_l2_cache_sizes, &_l3_cache_sizes); - } - LOG(kLOG_INFO) << "CPU num: " << _cpu_num; - for (int i = 0; i < _cpu_num; i++) { - if (!(_l1_cache_sizes.size() > i && _l2_cache_sizes.size() > i && - _l3_cache_sizes.size() > i)) { - break; - } - LOG(kLOG_INFO) << i << " L1 Cache: " << _l1_cache_sizes[i] << "KB" - << " L2 Cache: " << _l2_cache_sizes[i] << "KB" - << " L3 Cache: " << _l3_cache_sizes[i] << "KB"; - } - LOG(kLOG_INFO) << "Big cores: "; - for (int i = 0; i < _big_core_ids.size(); i++) { - LOG(kLOG_INFO) << _big_core_ids[i]; - } - LOG(kLOG_INFO) << "Little cores: "; - for (int i = 0; i < _little_core_ids.size(); i++) { - LOG(kLOG_INFO) << _little_core_ids[i]; - } -#endif - // use single thread by default - set_thread_num(1, PERFORMANCE_PRIORITY); -} - -void CPUContext::set_thread_num(int thread_num, PowerMode power_mode) { - int big_core_num = _big_core_ids.size(); - int little_core_num = _little_core_ids.size(); -#ifdef _OPENMP - if (thread_num > _cpu_num) { - thread_num = _cpu_num; - } -#else - thread_num = 1; -#endif - std::vector bind_core_ids; - if (power_mode == PERFORMANCE_PRIORITY || power_mode == PERFORMANCE_ONLY) { - if (big_core_num > 0) { - bind_core_ids = _big_core_ids; - if (power_mode == PERFORMANCE_ONLY && thread_num > big_core_num) { - LOG(kLOG_ERROR) << "thread_num(" << thread_num - << ") exceed the big cores num (" << big_core_num << ")" - << ", force to set thread_num = " << big_core_num; - thread_num = big_core_num; - } - } - } else if (power_mode == EFFICIENCY_PRIORITY || 
- power_mode == EFFICIENCY_ONLY) { - if (little_core_num > 0) { - bind_core_ids = _little_core_ids; - if (power_mode == EFFICIENCY_ONLY && thread_num > little_core_num) { - LOG(kLOG_ERROR) << "thread_num(" << thread_num - << ") exceed the little cores num (" << little_core_num - << ")" - << ", force to set thread_num = " << little_core_num; - thread_num = little_core_num; - } - } - } - _power_mode = AUTO; -#ifdef _OPENMP - omp_set_num_threads(thread_num); - thread_num = omp_get_max_threads(); -#endif -#if !defined(__APPLE__) // Linux or Android - if (bind_core_ids.size() > 0 && check_online(&bind_core_ids) >= thread_num) { - bind_threads(bind_core_ids); - _power_mode = power_mode; - } -#endif - LOG(kLOG_INFO) << "thread num: " << thread_num - << " power mode: " << _power_mode; -} - -int CPUContext::get_thread_num() { - int thread_num = 1; -#ifdef _OPENMP - thread_num = omp_get_max_threads(); -#endif - return thread_num; -} - -int CPUContext::get_cache_size(int level) { - std::vector *ptr = nullptr; - if (level == 1) { - ptr = &_l1_cache_sizes; - } else if (level == 2) { - ptr = &_l2_cache_sizes; - } else if (level == 3) { - ptr = &_l3_cache_sizes; - } else { - return 0; - } - if (_power_mode == PERFORMANCE_PRIORITY || _power_mode == PERFORMANCE_ONLY) { - if (_big_core_ids.size() > 0) { - int idx = _big_core_ids[0]; - if (ptr->size() > idx) { - return (*ptr)[idx]; - } - } - } else if (_power_mode == EFFICIENCY_PRIORITY || - _power_mode == EFFICIENCY_ONLY) { - if (_little_core_ids.size() > 0) { - int idx = _little_core_ids[0]; - if (ptr->size() > idx) { - return (*ptr)[idx]; - } - } - } else { // AUTO - int idx = 0; - if (ptr->size() > idx) { - return (*ptr)[idx]; - } - } -} - -void *CPUContext::get_work_space(int size_in_byte) { - return reinterpret_cast( - _workspace.mutable_data(make_ddim({size_in_byte}))); -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/context.h b/mobile/src/framework/context.h deleted file mode 
100644 index 18e40311bc..0000000000 --- a/mobile/src/framework/context.h +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// Tencent is pleased to support the open source community by making ncnn -// available. -// -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of the -// License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations under -// the License. 
- -#pragma once - -#if _OPENMP -#include -#endif - -#include -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace framework { - -struct CPUContext { - private: - CPUContext(); - - public: - ~CPUContext() {} - - static CPUContext* Context() { - static CPUContext ctx; - return &ctx; - } - - void set_thread_num(int thread_num, - PowerMode power_mode = PERFORMANCE_PRIORITY); - int get_thread_num(); - PowerMode get_power_mode() const { return _power_mode; } - int get_cache_size(int level); - ARMArch get_arch() const { return _arch; } - int get_l1_cache_size() { return get_cache_size(1); } - int get_l2_cache_size() { return get_cache_size(2); } - int get_l3_cache_size() { return get_cache_size(3); } - void* get_work_space(int size_in_byte); - - int _cpu_num; - ARMArch _arch; - PowerMode _power_mode; - std::vector _big_core_ids; - std::vector _little_core_ids; - std::vector _l1_cache_sizes; - std::vector _l2_cache_sizes; - std::vector _l3_cache_sizes; - Tensor _workspace; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/data_layout.h b/mobile/src/framework/data_layout.h deleted file mode 100644 index fd0bec3913..0000000000 --- a/mobile/src/framework/data_layout.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -namespace paddle_mobile { -namespace framework { - -enum class DataLayout { - kNHWC = 0, - kNCHW = 1, - kAnyLayout = 2, -}; - -inline DataLayout StringToDataLayout(const std::string &str) { - std::string s(str); - for (size_t i = 0; i < s.size(); ++i) { - s[i] = toupper(s[i]); - } - - if (s == "NHWC") { - return DataLayout::kNHWC; - } else if (s == "NCHW") { - return DataLayout::kNCHW; - } else if (s == "ANYLAYOUT") { - return DataLayout::kAnyLayout; - } else { - PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) - } - return DataLayout::kNCHW; -} - -inline std::string DataLayoutToString(const DataLayout &data_layout) { - switch (data_layout) { - case DataLayout::kNHWC: - return "NHWC"; - case DataLayout::kNCHW: - return "NCHW"; - case DataLayout::kAnyLayout: - return "ANY_LAYOUT"; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ") - break; - } -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/data_type.cpp b/mobile/src/framework/data_type.cpp deleted file mode 100644 index 5eaf3ecaf5..0000000000 --- a/mobile/src/framework/data_type.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "framework/data_type.h" -#include -#include -#include -#include "common/type_define.h" - -namespace paddle_mobile { -namespace framework { - -struct DataTypeMap { - std::unordered_map - cpp_to_proto_; - std::unordered_map proto_to_cpp_; - std::unordered_map proto_to_str_; - std::unordered_map cpp_to_size_; -}; - -static DataTypeMap* InitDataTypeMap(); -// C++11 removes the need for manual locking. Concurrent execution shall wait if -// a static local variable is already being initialized. -// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex -static DataTypeMap& gDataTypeMap() { - static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); - return *g_data_type_map_; -} - -template -static inline void RegisterType( - DataTypeMap* map, _PaddleMobile__Framework__Proto__VarType__Type proto_type, - const std::string& name) { - map->proto_to_cpp_.emplace(static_cast(proto_type), - type_id().hash_code()); - map->cpp_to_proto_.emplace(type_id().hash_code(), proto_type); - map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(type_id().hash_code(), sizeof(T)); -} - -static DataTypeMap* InitDataTypeMap() { - auto retv = new DataTypeMap(); - -#define RegType(cc_type, proto_type) \ - RegisterType(retv, proto_type, #cc_type) - - // NOTE: Add your customize type here. 
- // RegType(float16, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16); - RegType(float, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32); - RegType(double, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64); - RegType(int, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32); - RegType(int64_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64); - RegType(bool, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL); - RegType(size_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T); - RegType(int16_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16); - RegType(uint8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8); - RegType(int8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8); - -#undef RegType - return retv; -} - -_PaddleMobile__Framework__Proto__VarType__Type ToDataType(kTypeId_t type) { - auto it = gDataTypeMap().cpp_to_proto_.find(type); - if (it != gDataTypeMap().cpp_to_proto_.end()) { - return it->second; - } - PADDLE_MOBILE_THROW_EXCEPTION("Not support %d as tensor type", type); -} - -kTypeId_t ToTypeIndex(_PaddleMobile__Framework__Proto__VarType__Type type) { - auto it = gDataTypeMap().proto_to_cpp_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_cpp_.end()) { - return it->second; - } - PADDLE_MOBILE_THROW_EXCEPTION( - "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as " - "tensor type", - static_cast(type)); -} - -std::string DataTypeToString( - const _PaddleMobile__Framework__Proto__VarType__Type type) { - auto it = gDataTypeMap().proto_to_str_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_str_.end()) { - return it->second; - } - PADDLE_MOBILE_THROW_EXCEPTION( - "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as " - "tensor type", - static_cast(type)); -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/data_type.h b/mobile/src/framework/data_type.h deleted file mode 100644 index 
bda823ada4..0000000000 --- a/mobile/src/framework/data_type.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "common/enforce.h" -#include "common/type_define.h" -#include "framework/framework.pb-c.h" - -namespace paddle_mobile { - -namespace framework { - -_PaddleMobile__Framework__Proto__VarType__Type ToDataType(kTypeId_t type); - -kTypeId_t ToTypeIndex(_PaddleMobile__Framework__Proto__VarType__Type type); - -inline _PaddleMobile__Framework__Proto__VarType__Type ToDataType(int type) { - return static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(type); -} - -template -inline void VisitDataType(_PaddleMobile__Framework__Proto__VarType__Type type, - Visitor visitor) { - switch (type) { - // case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16: - // visitor.template apply(); - // break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32: - visitor.template apply(); - break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64: - visitor.template apply(); - break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32: - visitor.template apply(); - break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64: - visitor.template apply(); - break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL: - visitor.template apply(); - break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8: - 
visitor.template apply(); - break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16: - visitor.template apply(); - break; - case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8: - visitor.template apply(); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Not supported %d", type); - } -} - -extern std::string DataTypeToString( - const _PaddleMobile__Framework__Proto__VarType__Type type); -inline std::ostream& operator<<( - std::ostream& out, - const _PaddleMobile__Framework__Proto__VarType__Type& type) { - out << DataTypeToString(type); - return out; -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/ddim.cpp b/mobile/src/framework/ddim.cpp deleted file mode 100644 index 4f68caad77..0000000000 --- a/mobile/src/framework/ddim.cpp +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ddim.h" -#include - -namespace paddle_mobile { -namespace framework { - -/// @cond HIDDEN - -template -Dim make_dim(const int64_t *d) { - return Dim(*d, make_dim(d + 1)); -} - -template <> -Dim<0> make_dim<0>(const int64_t *d) { - return Dim<0>(0); -} - -void make_ddim(DDim &ddim, const int64_t *dims, int n) { - switch (n) { - case 0: - ddim = make_dim<0>(dims); - break; - case 1: - ddim = make_dim<1>(dims); - break; - case 2: - ddim = make_dim<2>(dims); - break; - case 3: - ddim = make_dim<3>(dims); - break; - case 4: - ddim = make_dim<4>(dims); - break; - case 5: - ddim = make_dim<5>(dims); - break; - case 6: - ddim = make_dim<6>(dims); - break; - case 7: - ddim = make_dim<7>(dims); - break; - case 8: - ddim = make_dim<8>(dims); - break; - case 9: - ddim = make_dim<9>(dims); - break; - default: - break; - } -} - -/// @endcond - -DDim make_ddim(std::initializer_list dims) { - DDim result(make_dim(0)); - make_ddim(result, dims.begin(), dims.size()); - return result; -} - -DDim make_ddim(const std::vector &dims) { - DDim result(make_dim(0)); - make_ddim(result, &dims[0], dims.size()); - return result; -} - -DDim make_ddim(const std::vector &dims) { - std::vector res(dims.size()); - std::transform(dims.begin(), dims.end(), res.begin(), - [](int d) { return static_cast(d); }); - return make_ddim(res); -} - -/// @cond HIDDEN -// XXX For some reason, putting this in an anonymous namespace causes -// errors -struct DynamicMutableIndexer : Vistor { - public: - explicit DynamicMutableIndexer(int idx) : idx_(idx) {} - - template - int64_t &operator()(Dim &dim) const { - return dim[idx_]; - } - - private: - int idx_; -}; - -struct DynamicConstIndexer : public Vistor { - public: - explicit DynamicConstIndexer(int idx) : idx_(idx) {} - - template - int64_t operator()(const Dim &dim) const { - return dim[idx_]; - } - - private: - int idx_; -}; - -/// @endcond - -int64_t &DDim::operator[](int idx) { - return DDim::ApplyVistor(DynamicMutableIndexer(idx), *this); 
-} - -int64_t DDim::operator[](int idx) const { - return DDim::ApplyVistor(DynamicConstIndexer(idx), *this); -} - -int DDim::size() const { return arity(*this); } - -bool DDim::operator==(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - - if (v1.size() != v2.size()) { - return false; - } - - for (unsigned int i = 0; i < v1.size(); i++) { - if (v1[i] != v2[i]) { - return false; - } - } - - return true; - // } -} - -bool DDim::operator!=(DDim d) const { return !(*this == d); } - -DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - - std::vector v3; - - PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() != v2.size()"); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] + v2[i]); - } - - return make_ddim(v3); -} - -DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - - std::vector v3; - - PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() == v2.size()"); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] * v2[i]); - } - - return make_ddim(v3); -} - -int64_t get(const DDim &ddim, int idx) { return ddim[idx]; } - -void set(DDim *ddim, int idx, int value) { (*ddim)[idx] = value; } - -/// @cond HIDDEN -struct VectorizeVisitor : Vistor { - std::vector &vector; - - explicit VectorizeVisitor(std::vector &v) : vector(v) {} - - template - void operator()(const T &t) { - vector.push_back(t.head); - this->operator()(t.tail); - } - - void operator()(const Dim<0> &t) {} -}; -/// @endcond - -std::vector vectorize(const DDim &ddim) { - std::vector result; - VectorizeVisitor visitor(result); - DDim::ApplyVistor(visitor, ddim); - return result; -} - -// NOTE: framework::vectorize converts to type int64_t -// which does not fit cudnn inputs. 
-std::vector vectorize2int(const DDim &ddim) { - std::vector temp = vectorize(ddim); - std::vector result(temp.begin(), temp.end()); - return result; -} - -struct ProductVisitor : Vistor { - template - int64_t operator()(const Dim &dim) { - return product(dim); - } -}; - -int64_t product(const DDim &ddim) { - ProductVisitor visitor; - return DDim::ApplyVistor(visitor, ddim); -} - -struct SliceVectorizeVisitor : Vistor { - std::vector &vector; - int begin; - int end; - - SliceVectorizeVisitor(std::vector &v, int b, int e) - : vector(v), begin(b), end(e) { - PADDLE_MOBILE_ENFORCE( - begin < end, "Begin index must be less than end index in ddim slice."); - PADDLE_MOBILE_ENFORCE(begin >= 0, - "Begin index can't be less than zero in ddim slice."); - } - - template - void operator()(const Dim &dim) { - if (begin == 0) { - vector.push_back(dim.head); - } else { - --begin; - } - --end; - if (end > 0) { - this->operator()(dim.tail); - } - } - - void operator()(const Dim<0> &dim) { - // PADDLE_ENFORCE(end == 0, "End index in ddim slice is out - // of bound."); - } -}; - -DDim slice_ddim(const DDim &ddim, int begin, int end) { - std::vector vec; - vec.reserve(end - begin); - SliceVectorizeVisitor visitor(vec, begin, end); - DDim::ApplyVistor(visitor, ddim); - return make_ddim(vec); -} - -/// \cond HIDDEN - -struct ArityVisitor : Vistor { - template - int operator()(Dim) const { - return D; - } -}; - -/// \endcond - -int arity(const DDim &d) { - ArityVisitor arityVisitor = ArityVisitor(); - return DDim::ApplyVistor(arityVisitor, d); -} - -#ifdef PADDLE_MOBILE_DEBUG -Print &operator<<(Print &printer, const DDim &ddim) { - for (int j = 0; j < ddim.size(); ++j) { - printer << ddim[j] << " "; - } - - return printer; -} - -#endif - -DDim::DDim(std::initializer_list init_list) { - *this = make_ddim(init_list); -} - -DDim flatten_to_2d(const DDim &src, int num_col_dims) { - int rank = src.size(); - return make_ddim({product(slice_ddim(src, 0, num_col_dims)), - product(slice_ddim(src, 
num_col_dims, rank))}); -} - -DDim flatten_to_1d(const DDim &src) { return make_ddim({product(src)}); } - -DDim stride(const DDim &ddim) { - std::vector strides(ddim.size()); - strides[ddim.size() - 1] = 1; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i + 1]; - } - return framework::make_ddim(strides); -} - -DDim stride_numel(const framework::DDim &ddim) { - std::vector strides(ddim.size()); - strides[ddim.size() - 1] = ddim[ddim.size() - 1]; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i]; - } - return framework::make_ddim(strides); -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/ddim.h b/mobile/src/framework/ddim.h deleted file mode 100644 index 5d3844be78..0000000000 --- a/mobile/src/framework/ddim.h +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "common/enforce.h" -#include "common/variant.h" -#include "framework/dim.h" - -namespace paddle_mobile { -namespace framework { - -/** - * \brief A dynamically sized dimension. - * - * The number of dimensions must be between [1, 9]. 
- */ -struct DDim { - typedef Variant, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, - Dim<7>, Dim<8>, Dim<9>> - DDimVar; - DDimVar var; - - template - static typename Vistor::type_t ApplyVistor(Vistor vistor, const DDim &d) { - if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return vistor(d.var.Get>()); - } else { - PADDLE_MOBILE_ENFORCE(false, " dim not support"); - } - } - - DDim() { var.Set>(Dim<1>()); } - - template - explicit DDim(const Dim &in) { - var.Set>(in); - } - - DDim(const DDim &in) { setNewDim(in); } - - /*implicit*/ DDim(std::initializer_list init_list); - - template - DDim &operator=(const Dim &in) { - var.Set>(in); - return *this; - } - - DDim &operator=(const DDim &in) { - setNewDim(in); - return *this; - } - - void setNewDim(const DDim &d) { - if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if 
(d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else if (d.var.TypeId() == type_id>()) { - return var.Set>(d.var.Get>()); - } else { - PADDLE_MOBILE_ENFORCE(false, " dim not support"); - } - } - - int64_t &operator[](int idx); - - int64_t operator[](int idx) const; - - DDimVar getVar() const { return var; } - - bool operator==(DDim d) const; - - bool operator!=(DDim d) const; - - DDim operator+(DDim d) const; - - DDim operator*(DDim d) const; - - int size() const; -}; - -/** - * \brief Make a DDim from std::vector - * - * \param dims An vector of ints. Must be sized between [1, 9] - */ -DDim make_ddim(const std::vector &dims); - -DDim make_ddim(const std::vector &dims); - -/** - * \brief Make a DDim from an initializer list - * - * \param dims An initializer list of ints. Must be sized between [1, 9] - * - */ -DDim make_ddim(std::initializer_list dims); - -int64_t get(const DDim &dim, int idx); - -void set(DDim *dim, int idx, int val); - -std::vector vectorize(const DDim &ddim); - -std::vector vectorize2int(const DDim &ddim); - -int64_t product(const DDim &ddim); - -/** - * \brief Slice a ddim - * - * Slice dim with [begin, end). - * e.g. DDim d = make_ddim({1,2,3,4,5}); - * slice_ddim(d, 1, 3); ====> {2,3} - */ -DDim slice_ddim(const DDim &dim, int begin, int end); - -/** - * \brief What is the length of this dimension? - * - * \param Dynamic dimension to inspect - */ - -int arity(const DDim &ddim); - -// Reshape a tensor to a matrix. The matrix's first dimension(column -// length) -// will be the product of tensor's first `num_col_dims` dimensions. 
-DDim flatten_to_2d(const DDim &src, int num_col_dims); - -DDim flatten_to_1d(const DDim &src); - -DDim stride(const DDim &ddim); - -DDim stride_numel(const DDim &ddim); - -#ifdef PADDLE_MOBILE_DEBUG -Print &operator<<(Print &printer, const DDim &ddim); -#endif -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/dim.h b/mobile/src/framework/dim.h deleted file mode 100644 index e11d6fe39a..0000000000 --- a/mobile/src/framework/dim.h +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "common/enforce.h" -namespace paddle_mobile { -namespace framework { - -// Statically sized, statically indexed dimension -template -struct Dim { - static constexpr int dimensions = i; - - template - Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { - static_assert(sizeof...(_tail) == i - 1, - "Dim initialized with the wrong number of parameters"); - } - - Dim(int64_t _head, const Dim &_tail) : head(_head), tail(_tail) {} - - Dim() : head(0), tail() {} - - /** Construct a Dim from a linear index and size. Uses Fortran - * order - * indexing. 
*/ - Dim(int64_t idx, const Dim &size) - : head(idx % size.head), tail(idx / size.head, size.tail) {} - - /** Construct a Dim with each dimension set to the given index */ - explicit Dim(int64_t idx) : head(idx), tail(idx) {} - - bool operator==(const Dim &o) const { - return (head == o.head) && (tail == o.tail); - } - - bool operator!=(const Dim &o) const { return !(*this == o); } - - int64_t &operator[](int idx); - - int64_t operator[](int idx) const; - - std::string to_string() const; - - int64_t head; - Dim tail; -}; - -// Base case specialization -template <> -struct Dim<0> { - static constexpr int dimensions = 0; - - explicit Dim(int64_t _head) {} - - Dim() {} - - Dim(int idx, const Dim<0> &size) { - if (idx > 0) { - PADDLE_MOBILE_THROW_EXCEPTION("Index out of range.") - } - } - - bool operator==(const Dim<0> &o) const { return true; } - - bool operator!=(const Dim<0> &o) const { return false; } - - int64_t &operator[](int idx); - - int64_t operator[](int idx) const; - - int64_t head; -}; - -namespace { - -// Helper for accessing Dim classes -template -struct DimGetter { - // Return a copy if Dim is const - template - static int64_t impl(const D &d) { - return DimGetter::impl(d.tail); - } - // Return a reference if Dim is mutable - template - static int64_t &impl(D &d) { - return DimGetter::impl(d.tail); - } -}; - -// Eureka! We found the element! 
-template <> -struct DimGetter<0> { - // Return a copy if Dim is const - template - static int64_t impl(const D &d) { - return d.head; - } - // Return a reference if Dim is mutable - template - static int64_t &impl(D &d) { - return d.head; - } -}; - -template -int64_t &indexer(Dim &dim, int idx) { - if (idx < 0) { - PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension") - } - - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -int64_t &indexer<0>(Dim<0> &dim, int idx) { - PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - return dim.head; -} - -template -int64_t indexer(const Dim &dim, int idx) { - if (idx < 0) { - PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension") - } - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -int64_t indexer<0>(const Dim<0> &dim, int idx) { - PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - return dim.head; -} - -} // namespace -// Static access to constant Dim -template -int64_t get(const Dim &d) { - return DimGetter::impl(d); -} - -// Static access to mutable Dim -template -int64_t &get(Dim &d) { - return DimGetter::impl(d); -} - -// Dynamic access to constant Dim -template -int64_t Dim::operator[](int i) const { - // std::cout << "l: " << l << std::endl; - return indexer(*this, i); -} - -// Dynamic access to mutable Dim -template -int64_t &Dim::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -inline int64_t Dim<0>::operator[](int i) const { return indexer(*this, i); } - -// Dynamic access to mutable Dim -inline int64_t &Dim<0>::operator[](int i) { return indexer(*this, i); } - -// Dynamic access to constant Dim -// without std::enable_if will try to instantiate this on get<0>(d) -template -typename std::enable_if<(l > 0), int64_t>::type get(const Dim &d, int i) { - return d[i]; -} - -// Dynamic access to mutable Dim -template -typename std::enable_if<(l > 0), int64_t &>::type 
get(Dim &d, int i) { - return d[i]; -} - -// Dot product of two dims -template -int64_t linearize(const Dim &a, const Dim &b) { - return a.head * b.head + linearize(a.tail, b.tail); -} - -// Base case dot product of two Dims -// Notice it is inline because it is no longer a template -template <> -inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) { - return 0; -} - -// Product of a Dim -template -int64_t product(const Dim &a, int prod = 1) { - return prod * a.head * product(a.tail); -} - -// Base case product of a Dim -// Notice it is inline because it is no longer a template -template <> -inline int64_t product(const Dim<0> &a, int prod) { - return prod; -} - -// Is 0 <= idx_i < size_i for all i? -template -bool contained(const Dim &idx, const Dim &size) { - return ((0 <= idx.head) && (idx.head < size.head) && - contained(idx.tail, size.tail)); -} - -// Base case of is 0 <= idx_i < size_i ? -// Notice it is inline because it is no longer a template -template <> -inline bool contained(const Dim<0> &idx, const Dim<0> &size) { - return true; -} - -/** - * \brief Compute exclusive prefix-multiply of a Dim. 
- */ -template -Dim ex_prefix_mul(const Dim &src, int mul = 1) { - return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); -} - -///\cond HIDDEN -// Base case of ex_prefix_mul -// Notice it is inline because it is no longer a template -template <> -inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) { - return Dim<0>(); -} -///\endcond - -/** - * Add two dimensions together - */ -template -Dim dim_plus(const Dim &a, const Dim &b) { - return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); -} - -// Base case -template <> -inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) { - return Dim<0>(); -} - -template -Dim operator+(const Dim &lhs, const Dim &rhs) { - return dim_plus(lhs, rhs); -} - -/** - * Multiply two dimensions together - */ -template -Dim dim_mult(const Dim &a, const Dim &b) { - return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); -} - -// Base case -template <> -inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) { - return Dim<0>(); -} - -template -Dim operator*(const Dim &lhs, const Dim &rhs) { - return dim_mult(lhs, rhs); -} - -/** - * \brief Normalize strides to ensure any dimension with extent 1 - * has stride 0. - * - * \param size Dim object containing the size of an array - * \param stride Dim object containing stride of an array - * \return Dim object the same size as \p size with normalized strides - * - */ - -template -Dim normalize_strides(const Dim &size, const Dim &stride) { - int norm_stride = size.head == 1 ? 0 : stride.head; - return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); -} - -///\cond HIDDEN - -template <> -inline Dim<0> normalize_strides(const Dim<0> &size, const Dim<0> &stride) { - return Dim<0>(); -} - -///\endcond - -/** - * Helper function to create a Dim - * - * \param idxes The type of Dim constructed depends on the number of - * params - * - */ - -template -Dim make_dim(Args... 
idxes) { - return Dim(idxes...); -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/executor.cpp b/mobile/src/framework/executor.cpp deleted file mode 100644 index cda5c5522c..0000000000 --- a/mobile/src/framework/executor.cpp +++ /dev/null @@ -1,1125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/executor.h" -#include -#include -#include -#include -#include "common/enforce.h" -#include "common/log.h" -#include "framework/context.h" -#include "framework/framework.pb-c.h" -#include "framework/lod_tensor.h" -#include "framework/operator.h" -#include "framework/program/program-optimize/program_optimize.h" -#include "framework/program/program_desc.h" -#include "framework/program/var_desc.h" -#include "framework/scope.h" -#include "framework/tensor.h" -#include "memory/t_malloc.h" -#include "pass/memory_optimize.h" -#include "pass/model_obfuscate.h" -#ifdef PADDLE_MOBILE_CL -#include "framework/cl/cl_image.h" -#include "pass/memory_optimize_cl.h" -#endif - -namespace paddle_mobile { -namespace framework { - -#pragma mark - executor - -template -void Executor::SetThreadNum(int thread_num, PowerMode power_mode) { - CPUContext::Context()->set_thread_num(thread_num, power_mode); -} - -template -Executor::Executor(const Program &program, - paddle_mobile::PaddleMobileConfigInternal config, - int batch_size, const bool use_optimize, - const bool lod_mode) - 
: program_(program), - batch_size_(batch_size), - use_optimize_(use_optimize), - lod_mode_(lod_mode), - config_(config) { - DLOG << "executor in lod mode: " << lod_mode; - - Variable *variable_ptr = program_.scope->Var("batch_size"); - variable_ptr->SetValue(batch_size); - - program_desc_ = - use_optimize_ ? program_.optimizeProgram : program_.originProgram; - PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr, - "program_desc_ should not be nullptr"); -#if !defined(PADDLE_MOBILE_FPGA) && !defined(PADDLE_MOBILE_FPGA_KD) && \ - !defined(PADDLE_MOBILE_CL) - if (config_.memory_optimization_level != NoMemoryOptimization) { - pass::MemoryOptPass()(program_desc_.get(), program_.scope.get(), - config_.memory_optimization_level); - } -#endif - // resize feed and fetch list - // should init feed and fetch variables before infer shape - InitFeedFetchList(); - const auto &blocks = program_desc_->Blocks(); - std::shared_ptr block_desc = blocks[0]; - std::vector> ops = block_desc->Ops(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op_desc = ops[j]; - LOG(kLOG_INFO) << "create op[" << j << "]: " << op_desc->Type(); - - auto op_handler = OpRegistry::CreateOp( - op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(), - op_desc->GetAttrMap(), program_.scope.get()); - // infer shape to reshape inputs and outputs before predict, - // but for lod mode, it still need to infer shape in runtime - if (!lod_mode) { - op_handler->InferShape(); - } - ops_of_block0_.push_back(op_handler); - } -#ifdef PADDLE_MOBILE_FPGA_V2 - InitQuantMemory(); -#endif - if (program_.combined) { - InitCombineMemory(); - } else { - InitMemory(); - } - int count = 0; -#ifdef PADDLE_MOBILE_PROFILE - std::vector profile(ops_of_block0_.size()); - struct timespec ts; - int op_index = 0; -#endif - for (auto &op_handler : ops_of_block0_) { -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - LOG(kLOG_INFO) 
<< "Initialize op[" << count++ - << "]: " << op_handler->Type(); - if (op_handler->Type() == "feed" || op_handler->Type() == "fetch") { - op_handler->setPrePostType(config_.pre_post_type); - } - op_handler->Init(); -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; - ++op_index; -#endif - } -#ifdef PADDLE_MOBILE_PROFILE - printf("================[ op init profile ]==================\n"); - PrintProfile(profile); -#endif - ApplyMemoryOptimise(config, lod_mode); -} - -template -void Executor::ApplyMemoryOptimise( - const PaddleMobileConfigInternal &config, const bool lod_mode) const {} - -#ifdef PADDLE_MOBILE_CL -template <> -void Executor::ApplyMemoryOptimise( - const PaddleMobileConfigInternal &config, const bool lod_mode) const { - if (!config.load_when_predict && !lod_mode && - config_.memory_optimization_level != NoMemoryOptimization) { - pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(), - config_.memory_optimization_level); - } -} -#endif - -template -void Executor::InitFeedFetchList() { - std::unordered_map feed_indices, fetch_indices; - for (const auto &block : program_desc_->Blocks()) { - for (const auto &op_desc : block->Ops()) { - if (op_desc->Type() == "feed") { - std::string name = op_desc->Output("Out")[0]; - feed_indices[name] = op_desc->GetAttr("col").Get(); - } else if (op_desc->Type() == "fetch") { - std::string name = op_desc->Input("X")[0]; - fetch_indices[name] = op_desc->GetAttr("col").Get(); - } - } - } - feed_indices_.swap(feed_indices); - fetch_indices_.swap(fetch_indices); - - auto *feed_var = program_.scope->Var("feed"); - auto *feed_list = feed_var->template GetMutable(); - feed_list->resize(feed_indices_.size()); - - auto *fetch_var = program_.scope->Var("fetch"); - auto *fetch_list = - fetch_var->template GetMutable(); - fetch_list->resize(fetch_indices_.size()); -} - -template -static void LoadMemInternal(void **in_data, void 
*out_data, int64_t size, - bool quant_uint8 = false, int quant_fold = 1) { - char **data_buf = reinterpret_cast(in_data); - T *tensor_data = reinterpret_cast(out_data); - if (quant_uint8) { - const int minimal_fold_size = 2; - quant_fold = fmin(fmax(1, size / minimal_fold_size), quant_fold); - int step = fmax(size / quant_fold, 1); - int visited_fold = 0; - while (visited_fold * step < size) { - // should be moved into operator init function - float min_value; - float max_value; - memory::Copy(&min_value, *data_buf, sizeof(float)); - memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float)); - *data_buf += 2 * sizeof(float); - const float factor = (max_value - min_value) / 255.0; - const uint8_t *uint8_data = reinterpret_cast(*data_buf); - int k = 0; - for (; k < step; ++k) { - int tensor_data_idx = visited_fold * step + k; - if (tensor_data_idx >= size) { - break; - } - tensor_data[tensor_data_idx] = uint8_data[k] * factor + min_value; - } - *data_buf += k * sizeof(uint8_t); - visited_fold++; - } - } else { - memory::Copy(tensor_data, *data_buf, size * sizeof(T)); - *data_buf += size * sizeof(T); - } -} - -template -void Executor::LoadMemory(void **data, - const std::shared_ptr var_desc, - LoDTensor *tensor) { - char **data_buf = reinterpret_cast(data); - // version - uint32_t version = *(reinterpret_cast(*data_buf)); - *data_buf += sizeof(uint32_t); - // lod information - // uint64_t lod_level = *(reinterpret_cast(*data_buf)); - uint64_t lod_level = 0; - memory::Copy(&lod_level, *data_buf, sizeof(uint64_t)); - *data_buf += sizeof(uint64_t); - - auto *lod = tensor->mutable_lod(); - lod->resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size = *(reinterpret_cast(*data_buf)); - *data_buf += sizeof(uint64_t); - std::vector tmp_dim(size / sizeof(size_t)); - memory::Copy(tmp_dim.data(), *data_buf, size); - (*lod)[i] = std::move(tmp_dim); - *data_buf += size; - } - // tensor version - uint32_t tensor_version = 
*(reinterpret_cast(*data_buf)); - *data_buf += sizeof(uint32_t); - // tensor desc size - int32_t tensor_desc_size = *(reinterpret_cast(*data_buf)); - *data_buf += sizeof(int32_t); - // skip tensor desc - *data_buf += tensor_desc_size; - - const TensorDesc &tensor_desc = var_desc->Tensor_desc(); - tensor->Resize(make_ddim(tensor_desc.Dims())); - // parse tensor from stream - switch (tensor_desc.DataType()) { - case VARTYPE_TYPE_FP32: - LoadMemInternal( - reinterpret_cast(data_buf), - reinterpret_cast(tensor->mutable_data()), tensor->numel(), - program_.quantification, program_.quantification_fold); - break; - case VARTYPE_TYPE_INT8: - LoadMemInternal( - reinterpret_cast(data_buf), - reinterpret_cast(tensor->mutable_data()), tensor->numel()); - break; - case VARTYPE_TYPE_INT32: - LoadMemInternal(reinterpret_cast(data_buf), - reinterpret_cast(tensor->mutable_data()), - tensor->numel()); - break; - default: - LOG(kLOG_ERROR) << "data type is not supported"; - } -} - -template -void Executor::InitMemory() { - for (const auto &block : program_desc_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if (var_desc->Persistable()) { - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - var->template GetMutable(); - continue; - } - DLOG << "init persistable var: " << var_desc->Name(); - char *origin_data = - ReadFileToBuff(program_.model_path + "/" + var_desc->Name()); - char *data = origin_data; - auto tensor = var->template GetMutable(); - LoadMemory(reinterpret_cast(&data), var_desc, tensor); - delete[] origin_data; - } else { - DLOG << "init no persistable var: " << var_desc->Name(); - varInputMemory(var_desc, var); - } - } - } -} - -template -void Executor::InitCombineMemory() { - char *origin_data = nullptr; - bool self_alloc = false; - if (program_.combined_params_buf && program_.combined_params_len) { - origin_data = reinterpret_cast( - const_cast(program_.combined_params_buf)); - if 
(config_.model_obfuscate_key != "") { - auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key); - obfuscator.convert_data(origin_data, program_.combined_params_len); - } - } else { - self_alloc = true; - origin_data = ReadFileToBuff(program_.para_path); - if (config_.model_obfuscate_key != "") { - auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key); - obfuscator.convert_data(origin_data, GetFileLength(program_.para_path)); - } - } - PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr"); - char *data = origin_data; - for (const auto &block : program_desc_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if (var_desc->Persistable()) { - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - var->template GetMutable(); - continue; - } - - DLOG << " init combine memory persistable: " << var_desc->Name(); - auto tensor = var->template GetMutable(); - LoadMemory(reinterpret_cast(&data), var_desc, tensor); - } else { - DLOG << " init combine memory no persistable: " << var_desc->Name(); - varInputMemory(var_desc, var); - } - } - } - if (self_alloc) { - delete[] origin_data; - } - LOG(kLOG_INFO) << "init combine memory finish"; -} - -static void ClearNoPersistableTensorArray(const framework::ProgramDesc *program, - framework::Scope *scope) { - for (const auto &block : program->Blocks()) { - for (const auto &var_desc : block->Vars()) { - if (!var_desc->Persistable() && - var_desc->Type() == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) { - auto var = scope->Var(var_desc->Name()); - auto array = var->template GetMutable(); - array->resize(1); - } - } - } -} - -template -void Executor::InitNoPersistableMemory(const Tensor &input_tensor) { - if (input_tensor.dims().size() != 4) { - return; - } - for (const auto &block : program_desc_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if 
(!var_desc->Persistable() && - var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { - DLOG << "InitNoPersistableMemory var " << var_desc->Name(); - auto tensor = var->template GetMutable(); - if (tensor->IsInitialized() && tensor->dims().size() == 4) { - // don't change user's input and avoid memory leaks - if (feed_indices_.find(var_desc->Name()) != feed_indices_.end()) { - break; - } - DDim tensor_dim = tensor->dims(); - DDim new_dim = - make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2], - input_tensor.dims()[3]}); - tensor->Resize(new_dim); - tensor->template mutable_data_new(); - DLOG << "var's tensor dims " << tensor_dim; - DLOG << "var's tensor new dims " << new_dim; - } else { - DLOG << "var's tensor is not Initialized ???"; - } - } - } - } -} - -template -bool Executor::varInputMemory( - const std::shared_ptr &var_desc, Variable *var) const { -#ifdef PADDLE_MOBILE_FPGA - framework::LoDTensor *tensor = var->template GetMutable(); -#ifdef PADDLE_MOBILE_FPGA_V2 - tensor->init(type_id().hash_code()); -#else - tensor->init(type_id().hash_code()); -#endif - return true; -#endif - - auto type = var_desc->Type(); - if (type == VARTYPE_TYPE_LOD_TENSOR) { - auto data_type = var_desc->Tensor_desc().DataType(); - framework::LoDTensor *tensor = var->template GetMutable(); - } else if (type == VARTYPE_TYPE_STEP_SCOPES) { - std::vector *step_scopes = - var->template GetMutable>(); - } else if (type == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) { - framework::LoDTensorArray *tensor_array = - var->template GetMutable(); - } else { - PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type); - } - return true; -} - -template -PMStatus Executor::Predict( - const std::vector> &inputs) { - for (const auto &input : inputs) { - SetInput(input.second, input.first); - } - return this->Predict(); -} - -template -PMStatus Executor::Predict( - const std::vector> &inputs) { - for (const auto &input : inputs) { - SetInput(input.second, input.first); - } - return 
this->Predict(); -} - -template -std::vector Executor::Predict(const std::vector &input, - const std::vector &dims) { - PADDLE_MOBILE_ENFORCE(feed_indices_.size() != 0, - "We don't know which tensor should be assign, since no " - "feed op found in this model"); - PADDLE_MOBILE_ENFORCE(fetch_indices_.size() != 0, - "We don't know which tensor should be fetch out, since " - "no fetch op found in this model"); - std::string input_name = feed_indices_.begin()->first; - Tensor feed_tensor(input, make_ddim(dims)); - SetInput(feed_tensor, input_name); - std::vector output; - if (this->Predict() == PMSuccess) { - std::string output_name = fetch_indices_.begin()->first; - const auto output_tensor = GetOutput(output_name); - output.resize(output_tensor->numel()); - memcpy(output.data(), output_tensor->template data(), - output.size() * sizeof(T)); - } - return output; -} - -template -void Executor::SetInput(const Tensor &input, - const std::string &var_name) { - int index = 0; - if (feed_indices_.find(var_name) != feed_indices_.end()) { - index = feed_indices_.find(var_name)->second; - } - auto *feed_var = program_.scope->Var("feed"); - framework::LoDTensor &target = - feed_var->template GetMutable()->at(index); - - target.Resize(input.dims()); - target.ShareDataWith(input); - if (feed_indices_.size() == 1) { - auto &dim = input.dims(); - if (lod_mode_ && product(dim) < 0.9 * product(input_dim_last_)) { - InitNoPersistableMemory(target); - } - input_dim_has_changed_ = input_dim_last_ != dim; - input_dim_last_ = static_cast(dim); - } -} - -template -void Executor::SetInput(const LoDTensor &input, - const std::string &var_name) { - int index = 0; - if (feed_indices_.find(var_name) != feed_indices_.end()) { - index = feed_indices_.find(var_name)->second; - } - auto *feed_var = program_.scope->Var("feed"); - framework::LoDTensor &target = - feed_var->template GetMutable()->at(index); - - target.Resize(input.dims()); - target.ShareDataWith(input); - target.set_lod(input.lod()); - 
if (feed_indices_.size() == 1) { - auto &dim = input.dims(); - if (lod_mode_ && product(dim) < 0.9 * product(input_dim_last_)) { - InitNoPersistableMemory(target); - } - input_dim_has_changed_ = input_dim_last_ != dim; - input_dim_last_ = static_cast(dim); - } -} - -template -std::shared_ptr Executor::GetOutput( - const std::string &var_name) { - const auto &iter = fetch_indices_.find(var_name); - if (var_name == "fetch" || iter != fetch_indices_.end()) { - int index = 0; - if (iter != fetch_indices_.end()) { - index = iter->second; - } - auto *fetch_var = program_.scope->Var("fetch"); - framework::LoDTensor &target = - fetch_var->template GetMutable()->at(index); - - return std::make_shared(target); - } else { - auto *fetch_var = program_.scope->Var(var_name); - framework::LoDTensor *target = - fetch_var->template GetMutable(); - return std::make_shared(*target); - } -} - -#ifdef PADDLE_MOBILE_CL -template -const CLImage *Executor::GetOutputImage( - const std::string &var_name) { - auto var = program_.scope->FindVar(var_name); - if (var->IsInitialized() && var->template IsType()) { - const CLImage *cl_image = var->template Get(); - return cl_image; - } else { - return nullptr; - } -} -#endif - -template -PMStatus Executor::Predict() { - try { -#if _OPENMP - omp_set_num_threads(CPUContext::Context()->get_thread_num()); -#endif - // clear all no persistable tensor array since write_to_array - // is always push back a new tensor in the array - ClearNoPersistableTensorArray(program_desc_.get(), program_.scope.get()); - -#ifdef PADDLE_MOBILE_PROFILE - std::vector profile(ops_of_block0_.size()); - struct timespec ts; - int op_index = 0; -#endif - for (int i = 0; i < ops_of_block0_.size(); ++i) { - auto &op_handler = ops_of_block0_[i]; -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - LOG(paddle_mobile::kLOG_INFO) << i << "th, " - << "run op: " << op_handler->Type(); - if 
(lod_mode_ && input_dim_has_changed_) { - op_handler->InferShape(); - } - op_handler->Run(); -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; - ++op_index; -#endif - } - if (feed_indices_.size() == 1) { - input_dim_has_changed_ = false; - } - -#ifdef PADDLE_MOBILE_PROFILE - PrintProfile(profile); -#endif - return PMSuccess; - } catch (PaddleMobileException &e) { - exception_msg_ = e.what(); - return PMException; - } catch (std::exception &e) { - exception_msg_ = e.what(); - return PMException; - } -} - -#ifdef PADDLE_MOBILE_PROFILE -template -void Executor::PrintProfile( - const vector::ProfInfo> &profile) const { - std::unordered_map _tp; - for (int i = 0; i < profile.size(); i++) { - const auto &pInfo = profile[i]; - uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; - if (this->ops_of_block0_[i]->Type() == "conv2d" || - this->ops_of_block0_[i]->Type() == "depthwise_conv2d") { - auto inputs = this->ops_of_block0_[i]->Inputs(); - - auto *filter = GetVarValue("Filter", inputs, - *(this->program_.scope)); - int kernel_size = filter->dims()[2]; - _tp[this->ops_of_block0_[i]->Type() + "_" + - std::to_string(kernel_size)] += timeCost; - } else { - _tp[this->ops_of_block0_[i]->Type()] += timeCost; - } - } - printf("====================[ profile ]======================\n"); - typedef std::pair prof_t; - std::vector _tv(_tp.begin(), _tp.end()); - uint64_t _ptotal = 0; - for (auto const &p : _tv) { - _ptotal += p.second; - } - auto compf = [](const prof_t &a, const prof_t &b) { - return a.second > b.second; - }; - std::sort(_tv.begin(), _tv.end(), compf); - _tv.push_back(std::make_pair("total", _ptotal)); - for (auto const &p : _tv) { - printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), - static_cast(p.second), - static_cast(p.second) / _ptotal * 100.0); - } - printf("====================[---------]======================\n"); -} -#endif - -template -void Executor::FeedTensorData(const 
vector &v) { - auto input_size = v.size(); - auto *feed_var = program_.scope->Var("feed"); - - PADDLE_MOBILE_ENFORCE(input_size == feed_indices_.size(), - "input data number not correct"); - for (int i = 0; i < input_size; i++) { - framework::LoDTensor &target = - feed_var->template GetMutable()->at(i); - target.ShareDataWith(v[input_size - i - 1]); - } -} - -template -void Executor::GetTensorResults( - std::vector *v) { - auto *fetch_var = program_.scope->Var("fetch"); - auto output_size = fetch_indices_.size(); - for (int i = 0; i < output_size; i++) { - framework::LoDTensor &target = - fetch_var->template GetMutable()->at(i); - v->push_back(&target); - } -} - -template -std::string Executor::GetExceptionMsg() { - return exception_msg_; -} - -#ifdef PADDLE_MOBILE_FPGA -template -void Executor::InjectVariable(const Tensor &t, - std::string var_name) { - Variable *g_feed_value = program_.scope->Var(var_name); - Tensor *feed_tensor = g_feed_value->template GetMutable(); - feed_tensor->Resize(t.dims()); - feed_tensor->ShareDataWith(t); -} - -template -void Executor::FeedData(const Tensor &t) { - InjectVariable(t, "feed0"); -} - -template -void Executor::FeedData(const std::vector &v) { - auto input_size = v.size(); - int index = 0; - // auto vars = program_.scope->VarContain("feed", &index); - // PADDLE_MOBILE_ENFORCE(input_size == vars.size(), - // "input data number not correct"); - for (int i = 0; i < input_size; i++) { - auto var = program_.scope->Var("feed", i + index); - auto feed_tensor = var->template GetMutable(); - feed_tensor->external_data = v[i]; - } -} - -template -void Executor::GetResults(std::vector *v) { - auto output_size = v->size(); - PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output"); - int index = 0; - auto vars = program_.scope->VarContain("fetch", &index); - PADDLE_MOBILE_ENFORCE(output_size == vars.size(), - "output data number not correct"); - - for (int i = 0; i < output_size; i++) { - auto var = program_.scope->Var("fetch", i + 
index); - auto fetch_tensor = var->template GetMutable(); - (*v)[i] = fetch_tensor->template data(); - } -} - -template -framework::Tensor *Executor::GetTensorByName( - const std::string &name) { - auto var = program_.scope->Var(name); - return var->template GetMutable(); -} - -template -std::shared_ptr Executor::FetchResult(int id) { - auto &ops = ops_of_block0_; - - PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range"); - auto op = id < 0 ? ops[ops.size() - 1] : ops[id]; - auto output_map = op->Outputs(); - std::vector out_keys = op->GetOutKeys(); - PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output"); - auto *output_tensor = - GetVarValue(out_keys[0], output_map, *(program_.scope)); - return std::make_shared(Tensor(*output_tensor)); -} - -template -void Executor::Predict_From_To(int start, int end) { - auto &ops = ops_of_block0_; - end = end < 0 ? static_cast(ops.size()) : end; - PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(), - "start or end parameter is wrong"); - -#ifdef PADDLE_MOBILE_PROFILE - std::vector profile(ops.size()); -#endif - for (int i = start; i < end; i++) { -#ifdef PADDLE_MOBILE_PROFILE - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - DLOG << "Running op: " << i << " " << ops[i]->Type(); - ops[i]->Run(); - -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - } -} - -template -void Executor::Predict_From(int start) { - Predict_From_To(start); -} - -template -void Executor::Predict_To(int end) { - Predict_From_To(0, end); -} -#ifdef PADDLE_MOBILE_FPGA_V2 -std::map LoadQuantValFromFile(std::string filename) { - std::map quantValList; - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - // std::cout << "open File Failed." 
<< std::endl; - DLOG << "open File Failed."; - exit(-1); - } - - std::string line; - while (getline(in, line)) { - std::string splitStr = " : "; - std::string::size_type pos; - pos = line.find(splitStr); - std::string subStr[2]; - subStr[0] = line.substr(0, pos); - subStr[1] = line.substr(pos + splitStr.size(), line.size()); - quantValList.insert(std::make_pair(subStr[0], atof(subStr[1].c_str()))); - } - in.close(); - return quantValList; -} - -template -void Executor::InitQuantMemory() { - std::string quantValFilePath; - if (program_.combined) { - quantValFilePath = program_.para_path; - quantValFilePath = - quantValFilePath.substr(0, (quantValFilePath.length() - 6)); - quantValFilePath = quantValFilePath + "scale"; - } else { - quantValFilePath = program_.model_path + "/scale"; - } - std::map quantValList = - LoadQuantValFromFile(quantValFilePath); - auto ops = ops_of_block0_; - for (int id = 0; id < ops.size(); id++) { - auto op = ops[id]; - auto input_keys = op->GetInputKeys(); - auto inputs = op->Inputs(); - for (auto key = input_keys.begin(); key != input_keys.end(); key++) { - auto inputs_vars = inputs[*key]; - int count = inputs_vars.size(); - for (int i = 0; i < count; i++) { - if (inputs_vars[i] != "feed") { - auto tensor = GetTensorByName(inputs_vars[i]); - tensor->scale[0] = quantValList[inputs_vars[i]]; - DLOG << "input variance name : " << inputs_vars[i] - << ", scale value : " << tensor->scale[0]; - } - } - } - auto output_keys = op->GetOutKeys(); - auto outputs = op->Outputs(); - for (auto key = output_keys.begin(); key != output_keys.end(); key++) { - auto outputs_vars = outputs[*key]; - int count = outputs_vars.size(); - for (int i = 0; i < count; i++) { - if (outputs_vars[i] != "fetch") { - auto tensor = GetTensorByName(outputs_vars[i]); - tensor->scale[0] = quantValList[outputs_vars[i]]; - DLOG << "output variance name : " << outputs_vars[i] - << ", scale value : " << tensor->scale[0]; - } - } - } - } -} -#endif -#endif -#ifdef PADDLE_MOBILE_CL 
-template <> -void Executor::InitNoPersistableMemory( - const Tensor &input_tensor) { - DLOG << "CL InitNoPersistableMemory "; - for (const auto &block : program_desc_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - - if (var_desc->Persistable()) { - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - var->template GetMutable(); - continue; - } - } else { - if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { - auto cl_image = var->template GetMutable(); - cl_context context = program_.scope->GetCLScpoe()->Context(); - cl_command_queue command_queue = - program_.scope->GetCLScpoe()->CommandQueue(); - - DDim tensor_dim = cl_image->dims(); - DDim new_dim = - make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2], - input_tensor.dims()[3]}); - cl_image->Resize(new_dim); - cl_image->InitEmptyImage(context, command_queue, new_dim); - } - } - } - } - std::shared_ptr output = GetOutput("fetch"); - output->Resize(input_tensor.dims()); - output->mutable_data(); -} - -template <> -void Executor::SetInput(const Tensor &input, - const std::string &var_name) { - int index = 0; - if (feed_indices_.find(var_name) != feed_indices_.end()) { - index = feed_indices_.find(var_name)->second; - } - auto *feed_var = program_.scope->Var("feed"); - framework::LoDTensor *input_tensor = - &(feed_var->template GetMutable()->at(index)); - - DLOG << "config_.load_when_predict " << config_.load_when_predict; - DLOG << "target_tensor->IsInitialized() " << input_tensor->IsInitialized(); - DLOG << "target_tensor->dims() " << input_tensor->dims(); - DLOG << "input.dims() " << input.dims(); - DLOG << "input_dim_last_ " << input_dim_last_; - if (config_.load_when_predict) { - if (input_dim_last_ != input.dims()) { - DLOG << "SetInput ---- > resize1"; - input_tensor->Resize(input.dims()); - input_tensor->mutable_data(); - if (config_.memory_optimization_level == NoMemoryOptimization) { - 
InitNoPersistableMemory(*input_tensor); - } else { - pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(), - config_.memory_optimization_level, - input.dims()); - } - } - } else { - DLOG << "SetInput ---- > resize2"; - input_tensor->Resize(input.dims()); - DLOG << "SetInput ---- > ShareDataWith"; - } - input_tensor->ShareDataWith(input); - if (feed_indices_.size() == 1) { - input_dim_has_changed_ = input_dim_last_ != input.dims(); - } - auto &dim = input.dims(); - input_dim_last_ = static_cast(dim); -} - -template -void Executor::LoadMemory(const VarDesc var_desc, float *tensorInput, - char **data) {} - -template <> -void Executor::LoadMemory(const VarDesc var_desc, - float *tensorInput, char **data) { - // 1. version - uint32_t version = *reinterpret_cast(*data); - - (*data) += sizeof(uint32_t); - - // 2 Lod information - uint64_t *lod_level_ptr = new uint64_t(); - memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); - uint64_t lod_level = *lod_level_ptr; - delete lod_level_ptr; - (*data) += sizeof(uint64_t); - - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size = *reinterpret_cast(*data); - (*data) += sizeof(uint64_t); - std::vector tmp(size / sizeof(size_t)); - - for (int k = 0; k < tmp.size(); ++k) { - tmp[k] = *reinterpret_cast(*data); - (*data) += sizeof(size_t); - } - } - - // 3. tensor version - uint32_t tensor_version = *reinterpret_cast(*data); - (*data) += sizeof(uint32_t); - - // 4. 
tensor desc - int32_t size = *reinterpret_cast(*data); - (*data) += sizeof(int32_t); - - std::unique_ptr buf(new char[size]); - for (int m = 0; m < size; ++m) { - buf.get()[m] = (*data)[m]; - } - (*data) += (sizeof(char) * size); - - const TensorDesc &desc = var_desc.Tensor_desc(); - int memory_size = 1; - for (auto l : desc.Dims()) { - memory_size *= l; - } - - void *memory = nullptr; - int type_size = 4; - memory = tensorInput; - - LoadMemInternal(reinterpret_cast(data), - reinterpret_cast(memory), memory_size, - program_.quantification, program_.quantification_fold); -} - -template <> -void Executor::InitMemory() { - for (const auto &block : program_desc_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if (var_desc->Persistable()) { - CLImage *cl_image = nullptr; - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - var->template GetMutable(); - continue; - } else { - cl_image = var->template GetMutable(); - } - - char *origin_data = - ReadFileToBuff(program_.model_path + "/" + var_desc->Name()); - char *data = origin_data; - cl_context context = program_.scope->GetCLScpoe()->Context(); - const TensorDesc &desc = var_desc->Tensor_desc(); - int numel = 1; - for (auto l : desc.Dims()) { - numel *= l; - } - DLOG << var_desc->Name(); - float *tensorInput = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * numel)); - LoadMemory(*var_desc, tensorInput, &data); - - DDim ddim = make_ddim(desc.Dims()); - - // has not init - cl_image->SetTensorData(tensorInput, ddim); - - delete origin_data; - paddle_mobile::memory::Free(tensorInput); - } else { - if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { - auto cl_image = var->template GetMutable(); - cl_context context = program_.scope->GetCLScpoe()->Context(); - cl_command_queue command_queue = - program_.scope->GetCLScpoe()->CommandQueue(); - - const TensorDesc &desc = var_desc->Tensor_desc(); - // DDim ddim = make_ddim(desc.Dims()); - 
DDim ddim = cl_image->dims(); - LOG(kLOG_DEBUG1) << "init image of " << var_desc->Name(); - cl_image->InitEmptyImage(context, command_queue, ddim); - } - } - } - } -} - -template <> -void Executor::InitCombineMemory() { - DLOG << "CL InitCombineMemory---- " - << "config_.load_when_predict: " << config_.load_when_predict; - char *origin_data = nullptr; - bool self_alloc = false; - if (program_.combined_params_buf && program_.combined_params_len) { - LOG(kLOG_INFO) << "use outter memory"; - origin_data = reinterpret_cast(program_.combined_params_buf); - if (config_.model_obfuscate_key != "") { - auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key); - obfuscator.convert_data(origin_data, program_.combined_params_len); - } - } else { - LOG(kLOG_INFO) << " begin init combine memory"; - self_alloc = true; - origin_data = ReadFileToBuff(program_.para_path); - if (config_.model_obfuscate_key != "") { - auto obfuscator = pass::ModelObfuscatePass(config_.model_obfuscate_key); - obfuscator.convert_data(origin_data, GetFileLength(program_.para_path)); - } - } - PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); - float *data = reinterpret_cast(origin_data); - - for (const auto &block : program_desc_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if (var_desc->Persistable()) { - CLImage *cl_image = nullptr; - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - var->template GetMutable(); - continue; - } else { - cl_image = var->template GetMutable(); - } - - cl_context context = program_.scope->GetCLScpoe()->Context(); - - const TensorDesc &desc = var_desc->Tensor_desc(); - DDim ddim = make_ddim(desc.Dims()); - - int numel = 1; - for (int i = 0; i < ddim.size(); i++) { - numel = numel * ddim[i]; - } - float *tensorInput = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * numel)); - LoadMemory(*var_desc, tensorInput, &origin_data); - - // has not 
init - cl_image->SetTensorData(tensorInput, ddim); - - paddle_mobile::memory::Free(tensorInput); - } else { - auto cl_image = var->template GetMutable(); - cl_context context = program_.scope->GetCLScpoe()->Context(); - cl_command_queue command_queue = - program_.scope->GetCLScpoe()->CommandQueue(); - const TensorDesc &desc = var_desc->Tensor_desc(); - DDim ddim = cl_image->dims(); - bool shouldResize = true; - if (ddim.size() > 4) { - for (int i = 0; i < ddim.size() - 4; ++i) { - if (ddim[i] != 0 && ddim[i] != 1) { - shouldResize = false; - break; - } - } - if (shouldResize) { - std::vector temp_intput_dims; - temp_intput_dims.reserve(static_cast(4)); - for (int i = ddim.size() - 4; i < ddim.size(); ++i) { - temp_intput_dims.push_back(ddim[i]); - } - ddim = framework::make_ddim(temp_intput_dims); - } - } - // DDim ddim = make_ddim(desc.Dims()); - cl_image->InitEmptyImage(context, command_queue, ddim); - } - } - } - if (self_alloc) { - delete data; - } - LOG(kLOG_INFO) << " end init combine memory "; -} - -#endif - -template class Executor; - -template class Executor; - -template class Executor; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/executor.h b/mobile/src/framework/executor.h deleted file mode 100644 index ebb16f697b..0000000000 --- a/mobile/src/framework/executor.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "common/types.h" -#include "common/util.h" -#include "framework/lod_tensor.h" -#include "framework/operator.h" -#include "framework/program/program.h" -#include "framework/tensor.h" -#include "framework/type_trait.h" -#include "pass/memory_optimize.h" - -namespace paddle_mobile { -namespace framework { - -template -class Executor { - public: - Executor(const Program &program, - paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1, - const bool use_optimize = true, const bool lod_mode = false); - - void SetThreadNum(int thread_num, - PowerMode power_mode = PERFORMANCE_PRIORITY); - - PMStatus Predict(const std::vector> &inputs); - PMStatus Predict( - const std::vector> &inputs); - - std::vector Predict(const std::vector &input, - const std::vector &dims); - PMStatus Predict(); - - void SetInput(const Tensor &input, const std::string &var_name); - void SetInput(const LoDTensor &input, const std::string &var_name); - - std::shared_ptr GetOutput(const std::string &var_name); -#ifdef PADDLE_MOBILE_CL - const CLImage *GetOutputImage(const std::string &var_name); -#endif - - void FeedTensorData(const std::vector &v); - void GetTensorResults(std::vector *v); - std::string GetExceptionMsg(); - -#ifdef PADDLE_MOBILE_FPGA - void InjectVariable(const Tensor &t, std::string var_name); - void FeedData(const Tensor &t); - void FeedData(const std::vector &v); - void GetResults(std::vector *v); - framework::Tensor *GetTensorByName(const std::string &name); - std::shared_ptr FetchResult(int id = -1); - void Predict_From_To(int start = 0, int end = -1); - void Predict_From(int start); - void Predict_To(int end); -#ifdef PADDLE_MOBILE_FPGA_V2 - void InitQuantMemory(); -#endif -#endif - - protected: - Executor() = default; - - bool varInputMemory(const std::shared_ptr &var_desc, - Variable *var) const; - void InitFeedFetchList(); - void InitMemory(); - void InitCombineMemory(); - 
void InitNoPersistableMemory(const Tensor &input_tensor); - void LoadMemory(void **data, const std::shared_ptr var_desc, - LoDTensor *tensor); -#ifdef PADDLE_MOBILE_CL - void LoadMemory(const VarDesc var_desc, float *tensorInput, char **data); -#endif - - int batch_size_; - bool use_optimize_; - bool lod_mode_; - PaddleMobileConfigInternal config_; - Program program_; - std::shared_ptr program_desc_; - std::vector>> ops_of_block0_; - std::unordered_map feed_indices_; - std::unordered_map fetch_indices_; - std::string exception_msg_; - - // for super resoltion - DDim input_dim_last_; - bool input_dim_has_changed_ = true; - -#ifdef PADDLE_MOBILE_PROFILE - typedef typename DtypeTensorTrait::gtype ProfileTensorType; - - struct ProfInfo { - int tid = 0; - uint64_t runBegin = 0UL; - uint64_t runEnd = 0UL; - }; - - void PrintProfile(const vector::ProfInfo> &profile) const; -#endif - void ApplyMemoryOptimise(const PaddleMobileConfigInternal &config, - const bool lod_mode) const; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/framework.pb-c.cpp b/mobile/src/framework/framework.pb-c.cpp deleted file mode 100644 index b8d76282ec..0000000000 --- a/mobile/src/framework/framework.pb-c.cpp +++ /dev/null @@ -1,1465 +0,0 @@ -/* Generated by the protocol buffer compiler. DO NOT EDIT! 
*/ -/* Generated from: framework.proto */ - -/* Do not generate deprecated warnings for self */ -#ifndef PROTOBUF_C__NO_DEPRECATED -#define PROTOBUF_C__NO_DEPRECATED -#endif - -#include "framework.pb-c.h" -void paddle_mobile__framework__proto__version__init( - PaddleMobile__Framework__Proto__Version *message) { - static const PaddleMobile__Framework__Proto__Version init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT; - *message = init_value; -} -PaddleMobile__Framework__Proto__Version * -paddle_mobile__framework__proto__version__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - return (PaddleMobile__Framework__Proto__Version *) - PaddleMobile__Framework__protobuf_c_message_unpack( - &paddle_mobile__framework__proto__version__descriptor, allocator, len, - data); -} -void paddle_mobile__framework__proto__version__free_unpacked( - PaddleMobile__Framework__Proto__Version *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!message) return; - assert(message->base.descriptor == - &paddle_mobile__framework__proto__version__descriptor); - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - (PaddleMobile__Framework__ProtobufCMessage *)message, allocator); -} -void paddle_mobile__framework__proto__op_desc__attr__init( - PaddleMobile__Framework__Proto__OpDesc__Attr *message) { - static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__op_desc__var__init( - PaddleMobile__Framework__Proto__OpDesc__Var *message) { - static const PaddleMobile__Framework__Proto__OpDesc__Var init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__op_desc__init( - PaddleMobile__Framework__Proto__OpDesc *message) { - static const PaddleMobile__Framework__Proto__OpDesc init_value = - 
PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT; - *message = init_value; -} -PaddleMobile__Framework__Proto__OpDesc * -paddle_mobile__framework__proto__op_desc__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - return (PaddleMobile__Framework__Proto__OpDesc *) - PaddleMobile__Framework__protobuf_c_message_unpack( - &paddle_mobile__framework__proto__op_desc__descriptor, allocator, len, - data); -} -void paddle_mobile__framework__proto__op_desc__free_unpacked( - PaddleMobile__Framework__Proto__OpDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!message) return; - assert(message->base.descriptor == - &paddle_mobile__framework__proto__op_desc__descriptor); - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - (PaddleMobile__Framework__ProtobufCMessage *)message, allocator); -} -void paddle_mobile__framework__proto__op_proto__var__init( - PaddleMobile__Framework__Proto__OpProto__Var *message) { - static const PaddleMobile__Framework__Proto__OpProto__Var init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__op_proto__attr__init( - PaddleMobile__Framework__Proto__OpProto__Attr *message) { - static const PaddleMobile__Framework__Proto__OpProto__Attr init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__op_proto__init( - PaddleMobile__Framework__Proto__OpProto *message) { - static const PaddleMobile__Framework__Proto__OpProto init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT; - *message = init_value; -} -PaddleMobile__Framework__Proto__OpProto * -paddle_mobile__framework__proto__op_proto__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - return (PaddleMobile__Framework__Proto__OpProto *) - PaddleMobile__Framework__protobuf_c_message_unpack( - 
&paddle_mobile__framework__proto__op_proto__descriptor, allocator, - len, data); -} -void paddle_mobile__framework__proto__op_proto__free_unpacked( - PaddleMobile__Framework__Proto__OpProto *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!message) return; - assert(message->base.descriptor == - &paddle_mobile__framework__proto__op_proto__descriptor); - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - (PaddleMobile__Framework__ProtobufCMessage *)message, allocator); -} -void paddle_mobile__framework__proto__var_type__tensor_desc__init( - PaddleMobile__Framework__Proto__VarType__TensorDesc *message) { - static const PaddleMobile__Framework__Proto__VarType__TensorDesc init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( - PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message) { - static const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc - init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( - PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message) { - static const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc - init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__var_type__reader_desc__init( - PaddleMobile__Framework__Proto__VarType__ReaderDesc *message) { - static const PaddleMobile__Framework__Proto__VarType__ReaderDesc init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__var_type__channel_desc__init( - PaddleMobile__Framework__Proto__VarType__ChannelDesc *message) { - static const PaddleMobile__Framework__Proto__VarType__ChannelDesc 
init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__var_type__tuple__init( - PaddleMobile__Framework__Proto__VarType__Tuple *message) { - static const PaddleMobile__Framework__Proto__VarType__Tuple init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT; - *message = init_value; -} -void paddle_mobile__framework__proto__var_type__init( - PaddleMobile__Framework__Proto__VarType *message) { - static const PaddleMobile__Framework__Proto__VarType init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT; - *message = init_value; -} -PaddleMobile__Framework__Proto__VarType * -paddle_mobile__framework__proto__var_type__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - return (PaddleMobile__Framework__Proto__VarType *) - PaddleMobile__Framework__protobuf_c_message_unpack( - &paddle_mobile__framework__proto__var_type__descriptor, allocator, - len, data); -} -void paddle_mobile__framework__proto__var_type__free_unpacked( - PaddleMobile__Framework__Proto__VarType *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!message) return; - assert(message->base.descriptor == - &paddle_mobile__framework__proto__var_type__descriptor); - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - (PaddleMobile__Framework__ProtobufCMessage *)message, allocator); -} -void paddle_mobile__framework__proto__var_desc__init( - PaddleMobile__Framework__Proto__VarDesc *message) { - static const PaddleMobile__Framework__Proto__VarDesc init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT; - *message = init_value; -} -PaddleMobile__Framework__Proto__VarDesc * -paddle_mobile__framework__proto__var_desc__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - return (PaddleMobile__Framework__Proto__VarDesc *) - 
PaddleMobile__Framework__protobuf_c_message_unpack( - &paddle_mobile__framework__proto__var_desc__descriptor, allocator, - len, data); -} -void paddle_mobile__framework__proto__var_desc__free_unpacked( - PaddleMobile__Framework__Proto__VarDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!message) return; - assert(message->base.descriptor == - &paddle_mobile__framework__proto__var_desc__descriptor); - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - (PaddleMobile__Framework__ProtobufCMessage *)message, allocator); -} -void paddle_mobile__framework__proto__block_desc__init( - PaddleMobile__Framework__Proto__BlockDesc *message) { - static const PaddleMobile__Framework__Proto__BlockDesc init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT; - *message = init_value; -} -PaddleMobile__Framework__Proto__BlockDesc * -paddle_mobile__framework__proto__block_desc__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - return (PaddleMobile__Framework__Proto__BlockDesc *) - PaddleMobile__Framework__protobuf_c_message_unpack( - &paddle_mobile__framework__proto__block_desc__descriptor, allocator, - len, data); -} -void paddle_mobile__framework__proto__block_desc__free_unpacked( - PaddleMobile__Framework__Proto__BlockDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!message) return; - assert(message->base.descriptor == - &paddle_mobile__framework__proto__block_desc__descriptor); - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - (PaddleMobile__Framework__ProtobufCMessage *)message, allocator); -} -void paddle_mobile__framework__proto__program_desc__init( - PaddleMobile__Framework__Proto__ProgramDesc *message) { - static const PaddleMobile__Framework__Proto__ProgramDesc init_value = - PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT; - *message = init_value; -} -PaddleMobile__Framework__Proto__ProgramDesc * 
-paddle_mobile__framework__proto__program_desc__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - return (PaddleMobile__Framework__Proto__ProgramDesc *) - PaddleMobile__Framework__protobuf_c_message_unpack( - &paddle_mobile__framework__proto__program_desc__descriptor, allocator, - len, data); -} -void paddle_mobile__framework__proto__program_desc__free_unpacked( - PaddleMobile__Framework__Proto__ProgramDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!message) return; - assert(message->base.descriptor == - &paddle_mobile__framework__proto__program_desc__descriptor); - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - (PaddleMobile__Framework__ProtobufCMessage *)message, allocator); -} -static const int64_t - paddle_mobile__framework__proto__version__version__default_value = 0ll; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__version__field_descriptors[1] = { - { - "version", 1, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64, - offsetof(PaddleMobile__Framework__Proto__Version, has_version), - offsetof(PaddleMobile__Framework__Proto__Version, version), NULL, - &paddle_mobile__framework__proto__version__version__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__version__field_indices_by_name[] = { - 0, /* field[0] = version */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__version__number_ranges[1 + 1] = {{1, 0}, - {0, 1}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__version__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.Version", - "Version", - "PaddleMobile__Framework__Proto__Version", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__Version), - 1, - 
paddle_mobile__framework__proto__version__field_descriptors, - paddle_mobile__framework__proto__version__field_indices_by_name, - 1, - paddle_mobile__framework__proto__version__number_ranges, - (ProtobufCMessageInit)paddle_mobile__framework__proto__version__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_desc__attr__field_descriptors[14] = { - { - "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, name), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, type), - &paddle_mobile__framework__proto__attr_type__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "i", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_i), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, i), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "f", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_FLOAT, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_f), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, f), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "s", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, s), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "ints", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_ints), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, ints), NULL, - NULL, 0, /* flags */ - 0, 
NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "floats", 7, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_FLOAT, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_floats), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, floats), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "strings", 8, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_strings), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, strings), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "b", 10, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_b), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, b), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "bools", 11, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_bools), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, bools), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "block_idx", 12, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, - has_block_idx), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, block_idx), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "l", 13, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_l), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, l), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "blocks_idx", 14, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, - n_blocks_idx), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, blocks_idx), - NULL, NULL, 0, /* flags */ - 0, NULL, 
NULL /* reserved1,reserved2, etc */ - }, - { - "longs", 15, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_longs), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, longs), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = { - 8, /* field[8] = b */ - 10, /* field[10] = block_idx */ - 12, /* field[12] = blocks_idx */ - 9, /* field[9] = bools */ - 3, /* field[3] = f */ - 6, /* field[6] = floats */ - 2, /* field[2] = i */ - 5, /* field[5] = ints */ - 11, /* field[11] = l */ - 13, /* field[13] = longs */ - 0, /* field[0] = name */ - 4, /* field[4] = s */ - 7, /* field[7] = strings */ - 1, /* field[1] = type */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = { - {1, 0}, {10, 8}, {0, 14}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_desc__attr__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.OpDesc.Attr", - "Attr", - "PaddleMobile__Framework__Proto__OpDesc__Attr", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr), - 14, - paddle_mobile__framework__proto__op_desc__attr__field_descriptors, - paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name, - 2, - paddle_mobile__framework__proto__op_desc__attr__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__op_desc__attr__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_desc__var__field_descriptors[2] = { - { - "parameter", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, parameter), - 
NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "arguments", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, - offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, n_arguments), - offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, arguments), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__op_desc__var__field_indices_by_name[] = { - 1, /* field[1] = arguments */ - 0, /* field[0] = parameter */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__op_desc__var__number_ranges[1 + 1] = { - {1, 0}, {0, 2}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_desc__var__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.OpDesc.Var", - "Var", - "PaddleMobile__Framework__Proto__OpDesc__Var", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__OpDesc__Var), - 2, - paddle_mobile__framework__proto__op_desc__var__field_descriptors, - paddle_mobile__framework__proto__op_desc__var__field_indices_by_name, - 1, - paddle_mobile__framework__proto__op_desc__var__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__op_desc__var__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const protobuf_c_boolean - paddle_mobile__framework__proto__op_desc__is_target__default_value = 0; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_desc__field_descriptors[5] = { - { - "inputs", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__OpDesc, n_inputs), - offsetof(PaddleMobile__Framework__Proto__OpDesc, inputs), - &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "outputs", 2, 
PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__OpDesc, n_outputs), - offsetof(PaddleMobile__Framework__Proto__OpDesc, outputs), - &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "type", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpDesc, type), NULL, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__OpDesc, n_attrs), - offsetof(PaddleMobile__Framework__Proto__OpDesc, attrs), - &paddle_mobile__framework__proto__op_desc__attr__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "is_target", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__OpDesc, has_is_target), - offsetof(PaddleMobile__Framework__Proto__OpDesc, is_target), NULL, - &paddle_mobile__framework__proto__op_desc__is_target__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__op_desc__field_indices_by_name[] = { - 3, /* field[3] = attrs */ - 0, /* field[0] = inputs */ - 4, /* field[4] = is_target */ - 1, /* field[1] = outputs */ - 2, /* field[2] = type */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__op_desc__number_ranges[1 + 1] = {{1, 0}, - {0, 5}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.OpDesc", - "OpDesc", - "PaddleMobile__Framework__Proto__OpDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__OpDesc), - 5, - 
paddle_mobile__framework__proto__op_desc__field_descriptors, - paddle_mobile__framework__proto__op_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__op_desc__number_ranges, - (ProtobufCMessageInit)paddle_mobile__framework__proto__op_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const protobuf_c_boolean - paddle_mobile__framework__proto__op_proto__var__duplicable__default_value = - 0; -static const protobuf_c_boolean - paddle_mobile__framework__proto__op_proto__var__intermediate__default_value = - 0; -static const protobuf_c_boolean - paddle_mobile__framework__proto__op_proto__var__dispensable__default_value = - 0; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_proto__var__field_descriptors[6] = { - { - "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, name), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "comment", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, comment), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "duplicable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, - has_duplicable), - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, duplicable), - NULL, - &paddle_mobile__framework__proto__op_proto__var__duplicable__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "intermediate", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, - has_intermediate), - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, - intermediate), - NULL, - &paddle_mobile__framework__proto__op_proto__var__intermediate__default_value, - 0, /* flags */ - 0, 
NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "dispensable", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, - has_dispensable), - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, dispensable), - NULL, - &paddle_mobile__framework__proto__op_proto__var__dispensable__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "reuse", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto__Var, reuse), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = { - 1, /* field[1] = comment */ - 4, /* field[4] = dispensable */ - 2, /* field[2] = duplicable */ - 3, /* field[3] = intermediate */ - 0, /* field[0] = name */ - 5, /* field[5] = reuse */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = { - {1, 0}, {0, 6}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_proto__var__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.OpProto.Var", - "Var", - "PaddleMobile__Framework__Proto__OpProto__Var", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__OpProto__Var), - 6, - paddle_mobile__framework__proto__op_proto__var__field_descriptors, - paddle_mobile__framework__proto__op_proto__var__field_indices_by_name, - 1, - paddle_mobile__framework__proto__op_proto__var__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__op_proto__var__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const protobuf_c_boolean - paddle_mobile__framework__proto__op_proto__attr__generated__default_value = - 0; -static const 
PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_proto__attr__field_descriptors[4] = { - { - "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, name), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, type), - &paddle_mobile__framework__proto__attr_type__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "comment", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, comment), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "generated", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, - has_generated), - offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, generated), - NULL, - &paddle_mobile__framework__proto__op_proto__attr__generated__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name[] = { - 2, /* field[2] = comment */ - 3, /* field[3] = generated */ - 0, /* field[0] = name */ - 1, /* field[1] = type */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__op_proto__attr__number_ranges[1 + 1] = { - {1, 0}, {0, 4}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_proto__attr__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.OpProto.Attr", - "Attr", - "PaddleMobile__Framework__Proto__OpProto__Attr", - "paddle_mobile.framework.proto", - 
sizeof(PaddleMobile__Framework__Proto__OpProto__Attr), - 4, - paddle_mobile__framework__proto__op_proto__attr__field_descriptors, - paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name, - 1, - paddle_mobile__framework__proto__op_proto__attr__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__op_proto__attr__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_proto__field_descriptors[5] = { - { - "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto, type), NULL, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "inputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__OpProto, n_inputs), - offsetof(PaddleMobile__Framework__Proto__OpProto, inputs), - &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "outputs", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__OpProto, n_outputs), - offsetof(PaddleMobile__Framework__Proto__OpProto, outputs), - &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__OpProto, n_attrs), - offsetof(PaddleMobile__Framework__Proto__OpProto, attrs), - &paddle_mobile__framework__proto__op_proto__attr__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "comment", 5, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__OpProto, comment), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* 
reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__op_proto__field_indices_by_name[] = { - 3, /* field[3] = attrs */ - 4, /* field[4] = comment */ - 1, /* field[1] = inputs */ - 2, /* field[2] = outputs */ - 0, /* field[0] = type */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__op_proto__number_ranges[1 + 1] = {{1, 0}, - {0, 5}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_proto__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.OpProto", - "OpProto", - "PaddleMobile__Framework__Proto__OpProto", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__OpProto), - 5, - paddle_mobile__framework__proto__op_proto__field_descriptors, - paddle_mobile__framework__proto__op_proto__field_indices_by_name, - 1, - paddle_mobile__framework__proto__op_proto__number_ranges, - (ProtobufCMessageInit)paddle_mobile__framework__proto__op_proto__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors - [2] = { - { - "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, - data_type), - &paddle_mobile__framework__proto__var_type__type__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "dims", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64, - offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, - n_dims), - offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, - dims), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name - [] = { - 0, /* field[0] = 
data_type */ - 1, /* field[1] = dims */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges[1 + - 1] = { - {1, 0}, {0, 2}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__tensor_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType.TensorDesc", - "TensorDesc", - "PaddleMobile__Framework__Proto__VarType__TensorDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarType__TensorDesc), - 2, - paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors, - paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__var_type__tensor_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const int32_t - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value = - 0; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors - [2] = { - { - "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, - tensor), - &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, - PROTOBUF_C_TYPE_INT32, - offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, - has_lod_level), - offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, - lod_level), - NULL, - &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const 
unsigned - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name - [] = { - 1, /* field[1] = lod_level */ - 0, /* field[0] = tensor */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges - [1 + 1] = {{1, 0}, {0, 2}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType.LoDTensorDesc", - "LoDTensorDesc", - "PaddleMobile__Framework__Proto__VarType__LoDTensorDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc), - 2, - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors, - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const int32_t - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value = - 0; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors - [2] = { - { - "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof( - PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, - tensor), - &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, - PROTOBUF_C_TYPE_INT32, - offsetof( - PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, - has_lod_level), - offsetof( - PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, - 
lod_level), - NULL, - &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name - [] = { - 1, /* field[1] = lod_level */ - 0, /* field[0] = tensor */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges - [1 + 1] = {{1, 0}, {0, 2}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc", - "LoDTensorArrayDesc", - "PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc), - 2, - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors, - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors[1] = { - { - "lod_tensor", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, - n_lod_tensor), - offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, - lod_tensor), - &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - 
paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name - [] = { - 0, /* field[0] = lod_tensor */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__reader_desc__number_ranges[1 + - 1] = { - {1, 0}, {0, 1}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__reader_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType.ReaderDesc", - "ReaderDesc", - "PaddleMobile__Framework__Proto__VarType__ReaderDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarType__ReaderDesc), - 1, - paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors, - paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_type__reader_desc__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__var_type__reader_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors - [2] = { - { - "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, - data_type), - &paddle_mobile__framework__proto__var_type__type__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "capacity", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT64, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, - capacity), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name - [] = { - 1, /* field[1] = capacity */ - 0, /* field[0] = data_type */ -}; -static const 
PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__channel_desc__number_ranges[1 + - 1] = - {{1, 0}, {0, 2}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__channel_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType.ChannelDesc", - "ChannelDesc", - "PaddleMobile__Framework__Proto__VarType__ChannelDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarType__ChannelDesc), - 2, - paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors, - paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_type__channel_desc__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__var_type__channel_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_type__tuple__field_descriptors[1] = { - { - "element_type", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_ENUM, - offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, - n_element_type), - offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, - element_type), - &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name[] = - { - 0, /* field[0] = element_type */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__tuple__number_ranges[1 + 1] = { - {1, 0}, {0, 1}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__tuple__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType.Tuple", - "Tuple", - 
"PaddleMobile__Framework__Proto__VarType__Tuple", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarType__Tuple), - 1, - paddle_mobile__framework__proto__var_type__tuple__field_descriptors, - paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_type__tuple__number_ranges, - (ProtobufCMessageInit) - paddle_mobile__framework__proto__var_type__tuple__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCEnumValue - paddle_mobile__framework__proto__var_type__type__enum_values_by_number[22] = - { - {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL", - 0}, - {"INT16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16", - 1}, - {"INT32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32", - 2}, - {"INT64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64", - 3}, - {"FP16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16", - 4}, - {"FP32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32", - 5}, - {"FP64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64", - 6}, - {"LOD_TENSOR", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR", 7}, - {"SELECTED_ROWS", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS", - 8}, - {"FEED_MINIBATCH", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH", - 9}, - {"FETCH_LIST", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST", 10}, - {"STEP_SCOPES", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES", - 11}, - {"LOD_RANK_TABLE", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE", - 12}, - {"LOD_TENSOR_ARRAY", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_" - "ARRAY", - 13}, - {"PLACE_LIST", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST", 14}, - {"READER", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER", 15}, - {"CHANNEL", - 
"PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL", 16}, - {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17}, - {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE", - 18}, - {"SIZE_T", - "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T", 19}, - {"UINT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8", - 20}, - {"INT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8", - 21}, -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0}, - {0, 22}}; -static const PaddleMobile__Framework__ProtobufCEnumValueIndex - paddle_mobile__framework__proto__var_type__type__enum_values_by_name[22] = { - {"BOOL", 0}, {"CHANNEL", 16}, - {"FEED_MINIBATCH", 9}, {"FETCH_LIST", 10}, - {"FP16", 4}, {"FP32", 5}, - {"FP64", 6}, {"INT16", 1}, - {"INT32", 2}, {"INT64", 3}, - {"INT8", 21}, {"LOD_RANK_TABLE", 12}, - {"LOD_TENSOR", 7}, {"LOD_TENSOR_ARRAY", 13}, - {"PLACE_LIST", 14}, {"RAW", 17}, - {"READER", 15}, {"SELECTED_ROWS", 8}, - {"SIZE_T", 19}, {"STEP_SCOPES", 11}, - {"TUPLE", 18}, {"UINT8", 20}, -}; -const PaddleMobile__Framework__ProtobufCEnumDescriptor - paddle_mobile__framework__proto__var_type__type__descriptor = { - PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType.Type", - "Type", - "PaddleMobile__Framework__Proto__VarType__Type", - "paddle_mobile.framework.proto", - 22, - paddle_mobile__framework__proto__var_type__type__enum_values_by_number, - 22, - paddle_mobile__framework__proto__var_type__type__enum_values_by_name, - 1, - paddle_mobile__framework__proto__var_type__type__value_ranges, - NULL, - NULL, - NULL, - NULL /* reserved[1234] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_type__field_descriptors[7] = { - { - "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, - 0, /* quantifier_offset */ - 
offsetof(PaddleMobile__Framework__Proto__VarType, type), - &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "selected_rows", 2, PROTOBUF_C_LABEL_OPTIONAL, - PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType, selected_rows), - &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "lod_tensor", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType, lod_tensor), - &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tensor_array", 4, PROTOBUF_C_LABEL_OPTIONAL, - PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType, tensor_array), - &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "reader", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType, reader), - &paddle_mobile__framework__proto__var_type__reader_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "channel", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType, channel), - &paddle_mobile__framework__proto__var_type__channel_desc__descriptor, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tuple", 7, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarType, tuple), - 
&paddle_mobile__framework__proto__var_type__tuple__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__var_type__field_indices_by_name[] = { - 5, /* field[5] = channel */ - 2, /* field[2] = lod_tensor */ - 4, /* field[4] = reader */ - 1, /* field[1] = selected_rows */ - 3, /* field[3] = tensor_array */ - 6, /* field[6] = tuple */ - 0, /* field[0] = type */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_type__number_ranges[1 + 1] = {{1, 0}, - {0, 7}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarType", - "VarType", - "PaddleMobile__Framework__Proto__VarType", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarType), - 7, - paddle_mobile__framework__proto__var_type__field_descriptors, - paddle_mobile__framework__proto__var_type__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_type__number_ranges, - (ProtobufCMessageInit)paddle_mobile__framework__proto__var_type__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const protobuf_c_boolean - paddle_mobile__framework__proto__var_desc__persistable__default_value = 0; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__var_desc__field_descriptors[3] = { - { - "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarDesc, name), NULL, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__VarDesc, type), - &paddle_mobile__framework__proto__var_type__descriptor, NULL, - 0, /* flags */ - 0, 
NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "persistable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, - offsetof(PaddleMobile__Framework__Proto__VarDesc, has_persistable), - offsetof(PaddleMobile__Framework__Proto__VarDesc, persistable), - NULL, - &paddle_mobile__framework__proto__var_desc__persistable__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__var_desc__field_indices_by_name[] = { - 0, /* field[0] = name */ - 2, /* field[2] = persistable */ - 1, /* field[1] = type */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__var_desc__number_ranges[1 + 1] = {{1, 0}, - {0, 3}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.VarDesc", - "VarDesc", - "PaddleMobile__Framework__Proto__VarDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__VarDesc), - 3, - paddle_mobile__framework__proto__var_desc__field_descriptors, - paddle_mobile__framework__proto__var_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__var_desc__number_ranges, - (ProtobufCMessageInit)paddle_mobile__framework__proto__var_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const int32_t - paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value = - -1; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__block_desc__field_descriptors[5] = { - { - "idx", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__BlockDesc, idx), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "parent_idx", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, - 0, /* quantifier_offset */ - 
offsetof(PaddleMobile__Framework__Proto__BlockDesc, parent_idx), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "vars", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_vars), - offsetof(PaddleMobile__Framework__Proto__BlockDesc, vars), - &paddle_mobile__framework__proto__var_desc__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "ops", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_ops), - offsetof(PaddleMobile__Framework__Proto__BlockDesc, ops), - &paddle_mobile__framework__proto__op_desc__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "forward_block_idx", 5, PROTOBUF_C_LABEL_OPTIONAL, - PROTOBUF_C_TYPE_INT32, - offsetof(PaddleMobile__Framework__Proto__BlockDesc, - has_forward_block_idx), - offsetof(PaddleMobile__Framework__Proto__BlockDesc, - forward_block_idx), - NULL, - &paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__block_desc__field_indices_by_name[] = { - 4, /* field[4] = forward_block_idx */ - 0, /* field[0] = idx */ - 3, /* field[3] = ops */ - 1, /* field[1] = parent_idx */ - 2, /* field[2] = vars */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__block_desc__number_ranges[1 + 1] = { - {1, 0}, {0, 5}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__block_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.BlockDesc", - "BlockDesc", - "PaddleMobile__Framework__Proto__BlockDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__BlockDesc), - 5, - 
paddle_mobile__framework__proto__block_desc__field_descriptors, - paddle_mobile__framework__proto__block_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__block_desc__number_ranges, - (ProtobufCMessageInit)paddle_mobile__framework__proto__block_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCFieldDescriptor - paddle_mobile__framework__proto__program_desc__field_descriptors[2] = { - { - "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks), - offsetof(PaddleMobile__Framework__Proto__ProgramDesc, blocks), - &paddle_mobile__framework__proto__block_desc__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "version", 2, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, - 0, /* quantifier_offset */ - offsetof(PaddleMobile__Framework__Proto__ProgramDesc, version), - &paddle_mobile__framework__proto__version__descriptor, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned - paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = { - 0, /* field[0] = blocks */ - 1, /* field[1] = version */ -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = { - {1, 0}, {0, 2}}; -const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__program_desc__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.ProgramDesc", - "ProgramDesc", - "PaddleMobile__Framework__Proto__ProgramDesc", - "paddle_mobile.framework.proto", - sizeof(PaddleMobile__Framework__Proto__ProgramDesc), - 2, - paddle_mobile__framework__proto__program_desc__field_descriptors, - paddle_mobile__framework__proto__program_desc__field_indices_by_name, - 1, - paddle_mobile__framework__proto__program_desc__number_ranges, - 
(ProtobufCMessageInit) - paddle_mobile__framework__proto__program_desc__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const PaddleMobile__Framework__ProtobufCEnumValue - paddle_mobile__framework__proto__attr_type__enum_values_by_number[12] = { - {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0}, - {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1}, - {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2}, - {"INTS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS", 3}, - {"FLOATS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS", 4}, - {"STRINGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS", 5}, - {"BOOLEAN", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN", 6}, - {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7}, - {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8}, - {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9}, - {"BLOCKS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS", 10}, - {"LONGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS", 11}, -}; -static const PaddleMobile__Framework__ProtobufCIntRange - paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0}, - {0, 12}}; -static const PaddleMobile__Framework__ProtobufCEnumValueIndex - paddle_mobile__framework__proto__attr_type__enum_values_by_name[12] = { - {"BLOCK", 8}, {"BLOCKS", 10}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, - {"FLOAT", 1}, {"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, - {"LONG", 9}, {"LONGS", 11}, {"STRING", 2}, {"STRINGS", 5}, -}; -const PaddleMobile__Framework__ProtobufCEnumDescriptor - paddle_mobile__framework__proto__attr_type__descriptor = { - PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, - "paddle_mobile.framework.proto.AttrType", - "AttrType", - "PaddleMobile__Framework__Proto__AttrType", - "paddle_mobile.framework.proto", - 12, - paddle_mobile__framework__proto__attr_type__enum_values_by_number, - 12, - 
paddle_mobile__framework__proto__attr_type__enum_values_by_name, - 1, - paddle_mobile__framework__proto__attr_type__value_ranges, - NULL, - NULL, - NULL, - NULL /* reserved[1234] */ -}; diff --git a/mobile/src/framework/framework.pb-c.h b/mobile/src/framework/framework.pb-c.h deleted file mode 100644 index 910963f1e6..0000000000 --- a/mobile/src/framework/framework.pb-c.h +++ /dev/null @@ -1,615 +0,0 @@ -/* Generated by the protocol buffer compiler. DO NOT EDIT! */ -/* Generated from: framework.proto */ - -#ifndef PROTOBUF_C_framework_2eproto__INCLUDED -#define PROTOBUF_C_framework_2eproto__INCLUDED - -#include - -PROTOBUF_C__BEGIN_DECLS - -#if PROTOBUF_C_VERSION_NUMBER < 1000000 -# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. -#elif 1003001 < PROTOBUF_C_MIN_COMPILER_VERSION -# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. 
-#endif - -typedef struct _PaddleMobile__Framework__Proto__Version - PaddleMobile__Framework__Proto__Version; -typedef struct _PaddleMobile__Framework__Proto__OpDesc - PaddleMobile__Framework__Proto__OpDesc; -typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr - PaddleMobile__Framework__Proto__OpDesc__Attr; -typedef struct _PaddleMobile__Framework__Proto__OpDesc__Var - PaddleMobile__Framework__Proto__OpDesc__Var; -typedef struct _PaddleMobile__Framework__Proto__OpProto - PaddleMobile__Framework__Proto__OpProto; -typedef struct _PaddleMobile__Framework__Proto__OpProto__Var - PaddleMobile__Framework__Proto__OpProto__Var; -typedef struct _PaddleMobile__Framework__Proto__OpProto__Attr - PaddleMobile__Framework__Proto__OpProto__Attr; -typedef struct _PaddleMobile__Framework__Proto__VarType - PaddleMobile__Framework__Proto__VarType; -typedef struct _PaddleMobile__Framework__Proto__VarType__TensorDesc - PaddleMobile__Framework__Proto__VarType__TensorDesc; -typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc - PaddleMobile__Framework__Proto__VarType__LoDTensorDesc; -typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc - PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc; -typedef struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc - PaddleMobile__Framework__Proto__VarType__ReaderDesc; -typedef struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc - PaddleMobile__Framework__Proto__VarType__ChannelDesc; -typedef struct _PaddleMobile__Framework__Proto__VarType__Tuple - PaddleMobile__Framework__Proto__VarType__Tuple; -typedef struct _PaddleMobile__Framework__Proto__VarDesc - PaddleMobile__Framework__Proto__VarDesc; -typedef struct _PaddleMobile__Framework__Proto__BlockDesc - PaddleMobile__Framework__Proto__BlockDesc; -typedef struct _PaddleMobile__Framework__Proto__ProgramDesc - PaddleMobile__Framework__Proto__ProgramDesc; - -/* --- enums --- */ - -typedef enum 
_PaddleMobile__Framework__Proto__VarType__Type { - /* - * Pod Types - */ - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, - /* - * Tensor is used in C++. - */ - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T = 19, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8 = 20, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8 = 21, - /* - * Other types that may need additional descriptions - */ - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16, - /* - * Any runtime decided variable type is raw - * raw variables should manage their own allocations - * in operators like nccl_op - */ - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE = - 18 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE) -} PaddleMobile__Framework__Proto__VarType__Type; -typedef enum _PaddleMobile__Framework__Proto__AttrType { - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT = 0, - 
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT = 1, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING = 2, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS = 3, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS = 4, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS = 5, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = 9, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS = 10, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS = - 11 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) -} PaddleMobile__Framework__Proto__AttrType; - -/* --- messages --- */ - -/* - * Any incompatible changes to ProgramDesc and its dependencies should - * raise the version defined version.h. - * Serailization and Deserialization codes should be modified in a way - * that supports old versions following the version and compatibility policy. 
- */ -struct _PaddleMobile__Framework__Proto__Version { - PaddleMobile__Framework__ProtobufCMessage base; - protobuf_c_boolean has_version; - int64_t version; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__version__descriptor) \ - , 0, 0ll \ - } - -struct _PaddleMobile__Framework__Proto__OpDesc__Attr { - PaddleMobile__Framework__ProtobufCMessage base; - char *name; - PaddleMobile__Framework__Proto__AttrType type; - protobuf_c_boolean has_i; - int32_t i; - protobuf_c_boolean has_f; - float f; - char *s; - size_t n_ints; - int32_t *ints; - size_t n_floats; - float *floats; - size_t n_strings; - char **strings; - protobuf_c_boolean has_b; - protobuf_c_boolean b; - size_t n_bools; - protobuf_c_boolean *bools; - protobuf_c_boolean has_block_idx; - int32_t block_idx; - protobuf_c_boolean has_l; - int64_t l; - size_t n_blocks_idx; - int32_t *blocks_idx; - size_t n_longs; - int64_t *longs; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__op_desc__attr__descriptor) \ - , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \ - 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL, 0, NULL \ - } - -struct _PaddleMobile__Framework__Proto__OpDesc__Var { - PaddleMobile__Framework__ProtobufCMessage base; - char *parameter; - size_t n_arguments; - char **arguments; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__op_desc__var__descriptor) \ - , NULL, 0, NULL \ - } - -/* - * OpDesc describes an instance of a C++ framework::OperatorBase - * derived class type. 
- */ -struct _PaddleMobile__Framework__Proto__OpDesc { - PaddleMobile__Framework__ProtobufCMessage base; - char *type; - size_t n_inputs; - PaddleMobile__Framework__Proto__OpDesc__Var **inputs; - size_t n_outputs; - PaddleMobile__Framework__Proto__OpDesc__Var **outputs; - size_t n_attrs; - PaddleMobile__Framework__Proto__OpDesc__Attr **attrs; - protobuf_c_boolean has_is_target; - protobuf_c_boolean is_target; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__op_desc__descriptor) \ - , NULL, 0, NULL, 0, NULL, 0, NULL, 0, 0 \ - } - -/* - * VarProto describes the C++ type framework::Variable. - */ -struct _PaddleMobile__Framework__Proto__OpProto__Var { - PaddleMobile__Framework__ProtobufCMessage base; - char *name; - char *comment; - protobuf_c_boolean has_duplicable; - protobuf_c_boolean duplicable; - protobuf_c_boolean has_intermediate; - protobuf_c_boolean intermediate; - protobuf_c_boolean has_dispensable; - protobuf_c_boolean dispensable; - char *reuse; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__op_proto__var__descriptor) \ - , NULL, NULL, 0, 0, 0, 0, 0, 0, NULL \ - } - -/* - * AttrProto describes the C++ type Attribute. - */ -struct _PaddleMobile__Framework__Proto__OpProto__Attr { - PaddleMobile__Framework__ProtobufCMessage base; - char *name; - PaddleMobile__Framework__Proto__AttrType type; - char *comment; - /* - * If that attribute is generated, it means the Paddle third - * language binding has responsibility to fill that - * attribute. End-User should not set that attribute. 
- */ - protobuf_c_boolean has_generated; - protobuf_c_boolean generated; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__op_proto__attr__descriptor) \ - , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, NULL, 0, 0 \ - } - -/* - * OpProto describes a C++ framework::OperatorBase derived class. - */ -struct _PaddleMobile__Framework__Proto__OpProto { - PaddleMobile__Framework__ProtobufCMessage base; - char *type; - size_t n_inputs; - PaddleMobile__Framework__Proto__OpProto__Var **inputs; - size_t n_outputs; - PaddleMobile__Framework__Proto__OpProto__Var **outputs; - size_t n_attrs; - PaddleMobile__Framework__Proto__OpProto__Attr **attrs; - char *comment; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__op_proto__descriptor) \ - , NULL, 0, NULL, 0, NULL, 0, NULL, NULL \ - } - -struct _PaddleMobile__Framework__Proto__VarType__TensorDesc { - PaddleMobile__Framework__ProtobufCMessage base; - /* - * Should only be PODType. 
Is enforced in C++ - */ - PaddleMobile__Framework__Proto__VarType__Type data_type; - /* - * [UNK, 640, 480] is saved as [-1, 640, 480] - */ - size_t n_dims; - int64_t *dims; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor) \ - , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0, NULL \ - } - -struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc { - PaddleMobile__Framework__ProtobufCMessage base; - PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; - protobuf_c_boolean has_lod_level; - int32_t lod_level; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor) \ - , NULL, 0, 0 \ - } - -struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc { - PaddleMobile__Framework__ProtobufCMessage base; - PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; - protobuf_c_boolean has_lod_level; - int32_t lod_level; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor) \ - , NULL, 0, 0 \ - } - -struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc { - PaddleMobile__Framework__ProtobufCMessage base; - size_t n_lod_tensor; - PaddleMobile__Framework__Proto__VarType__LoDTensorDesc **lod_tensor; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_type__reader_desc__descriptor) \ - , 0, NULL \ - } - -struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc { - PaddleMobile__Framework__ProtobufCMessage base; - PaddleMobile__Framework__Proto__VarType__Type data_type; - int64_t capacity; -}; -#define 
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_type__channel_desc__descriptor) \ - , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0 \ - } - -struct _PaddleMobile__Framework__Proto__VarType__Tuple { - PaddleMobile__Framework__ProtobufCMessage base; - size_t n_element_type; - PaddleMobile__Framework__Proto__VarType__Type *element_type; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_type__tuple__descriptor) \ - , 0, NULL \ - } - -struct _PaddleMobile__Framework__Proto__VarType { - PaddleMobile__Framework__ProtobufCMessage base; - PaddleMobile__Framework__Proto__VarType__Type type; - PaddleMobile__Framework__Proto__VarType__TensorDesc *selected_rows; - PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *lod_tensor; - PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *tensor_array; - PaddleMobile__Framework__Proto__VarType__ReaderDesc *reader; - PaddleMobile__Framework__Proto__VarType__ChannelDesc *channel; - PaddleMobile__Framework__Proto__VarType__Tuple *tuple; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_type__descriptor) \ - , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, NULL, NULL, NULL, \ - NULL, NULL, NULL \ - } - -struct _PaddleMobile__Framework__Proto__VarDesc { - PaddleMobile__Framework__ProtobufCMessage base; - char *name; - PaddleMobile__Framework__Proto__VarType *type; - protobuf_c_boolean has_persistable; - protobuf_c_boolean persistable; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__var_desc__descriptor) \ - , NULL, NULL, 0, 0 \ - } - -struct _PaddleMobile__Framework__Proto__BlockDesc { - PaddleMobile__Framework__ProtobufCMessage base; - int32_t idx; - 
int32_t parent_idx; - size_t n_vars; - PaddleMobile__Framework__Proto__VarDesc **vars; - size_t n_ops; - PaddleMobile__Framework__Proto__OpDesc **ops; - protobuf_c_boolean has_forward_block_idx; - int32_t forward_block_idx; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__block_desc__descriptor) \ - , 0, 0, 0, NULL, 0, NULL, 0, -1 \ - } - -/* - * Please refer to - * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md - * for more details. - * TODO(panyx0718): A model can have multiple programs. Need a - * way to distinguish them. Maybe ID or name? - */ -struct _PaddleMobile__Framework__Proto__ProgramDesc { - PaddleMobile__Framework__ProtobufCMessage base; - size_t n_blocks; - PaddleMobile__Framework__Proto__BlockDesc **blocks; - PaddleMobile__Framework__Proto__Version *version; -}; -#define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT( \ - &paddle_mobile__framework__proto__program_desc__descriptor) \ - , 0, NULL, NULL \ - } - -/* PaddleMobile__Framework__Proto__Version methods */ -void paddle_mobile__framework__proto__version__init( - PaddleMobile__Framework__Proto__Version *message); -PaddleMobile__Framework__Proto__Version * -paddle_mobile__framework__proto__version__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); -void paddle_mobile__framework__proto__version__free_unpacked( - PaddleMobile__Framework__Proto__Version *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -/* PaddleMobile__Framework__Proto__OpDesc__Attr methods */ -void paddle_mobile__framework__proto__op_desc__attr__init( - PaddleMobile__Framework__Proto__OpDesc__Attr *message); -/* PaddleMobile__Framework__Proto__OpDesc__Var methods */ -void paddle_mobile__framework__proto__op_desc__var__init( - PaddleMobile__Framework__Proto__OpDesc__Var *message); -/* 
PaddleMobile__Framework__Proto__OpDesc methods */ -void paddle_mobile__framework__proto__op_desc__init( - PaddleMobile__Framework__Proto__OpDesc *message); -PaddleMobile__Framework__Proto__OpDesc * -paddle_mobile__framework__proto__op_desc__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); -void paddle_mobile__framework__proto__op_desc__free_unpacked( - PaddleMobile__Framework__Proto__OpDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -/* PaddleMobile__Framework__Proto__OpProto__Var methods */ -void paddle_mobile__framework__proto__op_proto__var__init( - PaddleMobile__Framework__Proto__OpProto__Var *message); -/* PaddleMobile__Framework__Proto__OpProto__Attr methods */ -void paddle_mobile__framework__proto__op_proto__attr__init( - PaddleMobile__Framework__Proto__OpProto__Attr *message); -/* PaddleMobile__Framework__Proto__OpProto methods */ -void paddle_mobile__framework__proto__op_proto__init( - PaddleMobile__Framework__Proto__OpProto *message); -PaddleMobile__Framework__Proto__OpProto * -paddle_mobile__framework__proto__op_proto__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); -void paddle_mobile__framework__proto__op_proto__free_unpacked( - PaddleMobile__Framework__Proto__OpProto *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -/* PaddleMobile__Framework__Proto__VarType__TensorDesc methods */ -void paddle_mobile__framework__proto__var_type__tensor_desc__init( - PaddleMobile__Framework__Proto__VarType__TensorDesc *message); -/* PaddleMobile__Framework__Proto__VarType__LoDTensorDesc methods */ -void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( - PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message); -/* PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc methods */ -void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( - 
PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message); -/* PaddleMobile__Framework__Proto__VarType__ReaderDesc methods */ -void paddle_mobile__framework__proto__var_type__reader_desc__init( - PaddleMobile__Framework__Proto__VarType__ReaderDesc *message); -/* PaddleMobile__Framework__Proto__VarType__ChannelDesc methods */ -void paddle_mobile__framework__proto__var_type__channel_desc__init( - PaddleMobile__Framework__Proto__VarType__ChannelDesc *message); -/* PaddleMobile__Framework__Proto__VarType__Tuple methods */ -void paddle_mobile__framework__proto__var_type__tuple__init( - PaddleMobile__Framework__Proto__VarType__Tuple *message); -/* PaddleMobile__Framework__Proto__VarType methods */ -void paddle_mobile__framework__proto__var_type__init( - PaddleMobile__Framework__Proto__VarType *message); -PaddleMobile__Framework__Proto__VarType * -paddle_mobile__framework__proto__var_type__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); -void paddle_mobile__framework__proto__var_type__free_unpacked( - PaddleMobile__Framework__Proto__VarType *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -/* PaddleMobile__Framework__Proto__VarDesc methods */ -void paddle_mobile__framework__proto__var_desc__init( - PaddleMobile__Framework__Proto__VarDesc *message); -PaddleMobile__Framework__Proto__VarDesc * -paddle_mobile__framework__proto__var_desc__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); -void paddle_mobile__framework__proto__var_desc__free_unpacked( - PaddleMobile__Framework__Proto__VarDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -/* PaddleMobile__Framework__Proto__BlockDesc methods */ -void paddle_mobile__framework__proto__block_desc__init( - PaddleMobile__Framework__Proto__BlockDesc *message); -PaddleMobile__Framework__Proto__BlockDesc * -paddle_mobile__framework__proto__block_desc__unpack( - 
PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); -void paddle_mobile__framework__proto__block_desc__free_unpacked( - PaddleMobile__Framework__Proto__BlockDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -/* PaddleMobile__Framework__Proto__ProgramDesc methods */ -void paddle_mobile__framework__proto__program_desc__init( - PaddleMobile__Framework__Proto__ProgramDesc *message); -PaddleMobile__Framework__Proto__ProgramDesc * -paddle_mobile__framework__proto__program_desc__unpack( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); -void paddle_mobile__framework__proto__program_desc__free_unpacked( - PaddleMobile__Framework__Proto__ProgramDesc *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -/* --- per-message closures --- */ - -typedef void (*PaddleMobile__Framework__Proto__Version_Closure)( - const PaddleMobile__Framework__Proto__Version *message, void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)( - const PaddleMobile__Framework__Proto__OpDesc__Attr *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__OpDesc__Var_Closure)( - const PaddleMobile__Framework__Proto__OpDesc__Var *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__OpDesc_Closure)( - const PaddleMobile__Framework__Proto__OpDesc *message, void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__OpProto__Var_Closure)( - const PaddleMobile__Framework__Proto__OpProto__Var *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__OpProto__Attr_Closure)( - const PaddleMobile__Framework__Proto__OpProto__Attr *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__OpProto_Closure)( - const PaddleMobile__Framework__Proto__OpProto *message, void *closure_data); -typedef void 
(*PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure)( - const PaddleMobile__Framework__Proto__VarType__TensorDesc *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__VarType__LoDTensorDesc_Closure)( - const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message, - void *closure_data); -typedef void ( - *PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc_Closure)( - const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__VarType__ReaderDesc_Closure)( - const PaddleMobile__Framework__Proto__VarType__ReaderDesc *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__VarType__ChannelDesc_Closure)( - const PaddleMobile__Framework__Proto__VarType__ChannelDesc *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__VarType__Tuple_Closure)( - const PaddleMobile__Framework__Proto__VarType__Tuple *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__VarType_Closure)( - const PaddleMobile__Framework__Proto__VarType *message, void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__VarDesc_Closure)( - const PaddleMobile__Framework__Proto__VarDesc *message, void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__BlockDesc_Closure)( - const PaddleMobile__Framework__Proto__BlockDesc *message, - void *closure_data); -typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)( - const PaddleMobile__Framework__Proto__ProgramDesc *message, - void *closure_data); - -/* --- services --- */ - -/* --- descriptors --- */ - -extern const PaddleMobile__Framework__ProtobufCEnumDescriptor - paddle_mobile__framework__proto__attr_type__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__version__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - 
paddle_mobile__framework__proto__op_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_desc__attr__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_desc__var__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_proto__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_proto__var__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__op_proto__attr__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__tensor_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__reader_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__channel_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_type__tuple__descriptor; -extern const PaddleMobile__Framework__ProtobufCEnumDescriptor - paddle_mobile__framework__proto__var_type__type__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__var_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - 
paddle_mobile__framework__proto__block_desc__descriptor; -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor - paddle_mobile__framework__proto__program_desc__descriptor; - -PROTOBUF_C__END_DECLS - -#endif /* PROTOBUF_C_framework_2eproto__INCLUDED */ diff --git a/mobile/src/framework/framework.proto b/mobile/src/framework/framework.proto deleted file mode 100644 index 27a98e0d61..0000000000 --- a/mobile/src/framework/framework.proto +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto2"; -option optimize_for = LITE_RUNTIME; -package paddle_mobile.framework.proto; - -// Any incompatible changes to ProgramDesc and its dependencies should -// raise the version defined version.h. -// -// Serailization and Deserialization codes should be modified in a way -// that supports old versions following the version and compatibility policy. -message Version { optional int64 version = 1 [ default = 0 ]; } - -enum AttrType { - INT = 0; - FLOAT = 1; - STRING = 2; - INTS = 3; - FLOATS = 4; - STRINGS = 5; - BOOLEAN = 6; - BOOLEANS = 7; - BLOCK = 8; - LONG = 9; - BLOCKS = 10; - LONGS = 11; -} - -// OpDesc describes an instance of a C++ framework::OperatorBase -// derived class type. 
-message OpDesc { - - message Attr { - required string name = 1; - required AttrType type = 2; - optional int32 i = 3; - optional float f = 4; - optional string s = 5; - repeated int32 ints = 6; - repeated float floats = 7; - repeated string strings = 8; - optional bool b = 10; - repeated bool bools = 11; - optional int32 block_idx = 12; - optional int64 l = 13; - repeated int32 blocks_idx = 14; - repeated int64 longs = 15; - }; - - message Var { - required string parameter = 1; - repeated string arguments = 2; - }; - - required string type = 3; - repeated Var inputs = 1; - repeated Var outputs = 2; - repeated Attr attrs = 4; - optional bool is_target = 5 [ default = false ]; -}; - -// OpProto describes a C++ framework::OperatorBase derived class. -message OpProto { - - // VarProto describes the C++ type framework::Variable. - message Var { - required string name = 1; - required string comment = 2; - - optional bool duplicable = 3 [ default = false ]; - optional bool intermediate = 4 [ default = false ]; - optional bool dispensable = 5 [ default = false ]; - optional string reuse = 6; - } - - // AttrProto describes the C++ type Attribute. - message Attr { - required string name = 1; - required AttrType type = 2; - required string comment = 3; - // If that attribute is generated, it means the Paddle third - // language binding has responsibility to fill that - // attribute. End-User should not set that attribute. - optional bool generated = 4 [ default = false ]; - } - - required string type = 1; - repeated Var inputs = 2; - repeated Var outputs = 3; - repeated Attr attrs = 4; - required string comment = 5; -} - -message VarType { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - // Tensor is used in C++. 
- SIZE_T = 19; - UINT8 = 20; - INT8 = 21; - - // Other types that may need additional descriptions - LOD_TENSOR = 7; - SELECTED_ROWS = 8; - FEED_MINIBATCH = 9; - FETCH_LIST = 10; - STEP_SCOPES = 11; - LOD_RANK_TABLE = 12; - LOD_TENSOR_ARRAY = 13; - PLACE_LIST = 14; - READER = 15; - CHANNEL = 16; - // Any runtime decided variable type is raw - // raw variables should manage their own allocations - // in operators like nccl_op - RAW = 17; - TUPLE = 18; - } - - required Type type = 1; - - message TensorDesc { - // Should only be PODType. Is enforced in C++ - required Type data_type = 1; - repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - } - optional TensorDesc selected_rows = 2; - - message LoDTensorDesc { - required TensorDesc tensor = 1; - optional int32 lod_level = 2 [ default = 0 ]; - } - optional LoDTensorDesc lod_tensor = 3; - - message LoDTensorArrayDesc { - required TensorDesc tensor = 1; - optional int32 lod_level = 2 [ default = 0 ]; - } - optional LoDTensorArrayDesc tensor_array = 4; - - message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } - optional ReaderDesc reader = 5; - - message ChannelDesc { - required Type data_type = 1; - required int64 capacity = 2; - } - optional ChannelDesc channel = 6; - - message Tuple { repeated Type element_type = 1; } - optional Tuple tuple = 7; -} - -message VarDesc { - required string name = 1; - required VarType type = 2; - optional bool persistable = 3 [ default = false ]; -} - -message BlockDesc { - required int32 idx = 1; - required int32 parent_idx = 2; - repeated VarDesc vars = 3; - repeated OpDesc ops = 4; - optional int32 forward_block_idx = 5 [ default = -1 ]; -} - -// Please refer to -// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md -// for more details. -// TODO(panyx0718): A model can have multiple programs. Need a -// way to distinguish them. Maybe ID or name? 
-message ProgramDesc { - repeated BlockDesc blocks = 1; - - optional Version version = 2; -} diff --git a/mobile/src/framework/load_ops.h b/mobile/src/framework/load_ops.h deleted file mode 100755 index e04db5d1e8..0000000000 --- a/mobile/src/framework/load_ops.h +++ /dev/null @@ -1,388 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef PADDLE_MOBILE_CPU -#define LOAD_CPU_OP(op_type) \ - extern int TouchOpRegistrar_##op_type##_##cpu(); \ - static int use_op_itself_##op_type##_##cpu __attribute__((unused)) = \ - TouchOpRegistrar_##op_type##_##cpu() -#else -#define LOAD_CPU_OP(op_type) -#endif - -#ifdef PADDLE_MOBILE_CL -#define LOAD_GPU_CL_OP(op_type) \ - extern int TouchOpRegistrar_##op_type##_##cl(); \ - static int use_op_itself_##op_type##_##cl __attribute__((unused)) = \ - TouchOpRegistrar_##op_type##_##cl() -#else -#define LOAD_GPU_CL_OP(op_type) -#endif - -#ifdef PADDLE_MOBILE_FPGA -#define LOAD_FPGA_OP(op_type) \ - extern int TouchOpRegistrar_##op_type##_##fpga(); \ - static int use_op_itself_##op_type##_##fpga __attribute__((unused)) = \ - TouchOpRegistrar_##op_type##_##fpga() -#else -#define LOAD_FPGA_OP(op_type) -#endif - -#define LOAD_FUSION_MATCHER(op_type) \ - extern int TouchFusionMatcherRegistrar_##op_type(); \ - static int use_fusion_matcher_itself_##op_type __attribute__((unused)) = \ - TouchFusionMatcherRegistrar_##op_type(); - -#define LOAD_OP(op_type) \ - 
LOAD_CPU_OP(op_type); \ - LOAD_GPU_CL_OP(op_type); \ - LOAD_FPGA_OP(op_type); - -#define LOAD_OP1(op_type, device_type) LOAD_##device_type##_OP(op_type); - -#define LOAD_OP2(op_type, device_type1, device_type2) \ - LOAD_OP1(op_type, device_type1) \ - LOAD_OP1(op_type, device_type2) - -#define LOAD_OP3(op_type, device_type1, device_type2, device_type3) \ - LOAD_OP2(op_type, device_type1, device_type2) \ - LOAD_OP1(op_type, device_type3) - -// load requared ops -LOAD_OP(feed) -LOAD_OP(fetch) -#ifdef FILL_CONSTANT_OP -LOAD_OP2(fill_constant, CPU, FPGA) -#endif -#ifdef BATCHNORM_OP -LOAD_OP2(batch_norm, CPU, GPU_CL); -#endif -#ifdef INSTANCENORM_OP -LOAD_OP1(instance_norm, GPU_CL); -#endif -#ifdef BILINEAR_INTERP_OP -LOAD_OP1(bilinear_interp, CPU); -#endif -#ifdef NEAREST_INTERP_OP -LOAD_OP1(nearest_interp, CPU); -#endif -#ifdef LEAKY_RELU_OP -LOAD_OP1(leaky_relu, CPU); -#endif -#ifdef BOXCODER_OP -LOAD_OP2(box_coder, CPU, GPU_CL); -#endif -#ifdef CONCAT_OP -LOAD_OP3(concat, CPU, GPU_CL, FPGA); -#endif -#ifdef CONV_OP -LOAD_OP3(conv2d, CPU, GPU_CL, FPGA); -#endif -#ifdef LRN_OP -LOAD_OP2(lrn, CPU, GPU_CL); -#endif -#ifdef SIGMOID_OP -LOAD_OP1(sigmoid, CPU); -#endif -#ifdef FUSION_FC_RELU_OP -LOAD_OP2(fusion_fc_relu, CPU, FPGA); -LOAD_FUSION_MATCHER(fusion_fc_relu); -#endif -#ifdef FUSION_ELEMENTWISEADDRELU_OP -LOAD_OP2(fusion_elementwise_add_relu, CPU, FPGA); -LOAD_FUSION_MATCHER(fusion_elementwise_add_relu); -#endif -#ifdef SPLIT_OP -LOAD_OP2(split, CPU, GPU_CL); -#endif -#ifdef RESIZE_OP -LOAD_OP1(resize, CPU); -#endif -#ifdef FUSION_CONVADDBNRELU_OP -LOAD_OP3(fusion_conv_add_bn_relu, CPU, GPU_CL, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu); -#endif -#ifdef RESHAPE_OP -LOAD_OP2(reshape, CPU, GPU_CL); -#endif -#ifdef RESHAPE2_OP -LOAD_OP2(reshape2, CPU, GPU_CL); -#endif -#ifdef TRANSPOSE_OP -LOAD_OP2(transpose, CPU, GPU_CL); -#endif -#ifdef TRANSPOSE2_OP -LOAD_OP2(transpose2, CPU, GPU_CL); -#endif -#ifdef PRIORBOX_OP -LOAD_OP2(prior_box, CPU, GPU_CL); -#endif 
-#ifdef DENSITY_PRIORBOX_OP -LOAD_OP2(density_prior_box, CPU, GPU_CL); -#endif -#ifdef FUSION_CONVADDRELU_OP -LOAD_OP3(fusion_conv_add_relu, CPU, GPU_CL, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_add_relu); -#endif -#ifdef FUSION_CONVADD_OP -LOAD_OP2(fusion_conv_add, CPU, GPU_CL); -LOAD_FUSION_MATCHER(fusion_conv_add); -#endif -#ifdef SOFTMAX_OP -LOAD_OP2(softmax, CPU, GPU_CL); -#endif -#ifdef SHAPE_OP -LOAD_OP1(shape, CPU); -#endif -#ifdef DEPTHWISECONV_OP -LOAD_OP2(depthwise_conv2d, CPU, GPU_CL); -#endif -#ifdef CONV_TRANSPOSE_OP -LOAD_OP2(conv2d_transpose, CPU, GPU_CL); -#endif -#ifdef SCALE_OP -LOAD_OP2(scale, CPU, GPU_CL); -#endif -#ifdef ELEMENTWISEADD_OP -LOAD_OP2(elementwise_add, CPU, GPU_CL); -#endif -#ifdef PRELU_OP -LOAD_OP1(prelu, CPU); -#endif -#ifdef TANH_OP -LOAD_OP2(tanh, CPU, GPU_CL); -#endif -#ifdef FLATTEN_OP -LOAD_OP1(flatten, CPU); -#endif -#ifdef FLATTEN2_OP -LOAD_OP2(flatten2, CPU, GPU_CL); -#endif -#ifdef FUSION_CONVBNADDRELU_OP -LOAD_OP3(fusion_conv_bn_add_relu, CPU, GPU_CL, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_bn_add_relu); -#endif -#ifdef FUSION_CONVBNRELU_OP -LOAD_OP3(fusion_conv_bn_relu, CPU, GPU_CL, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_bn_relu); -#endif -#ifdef FUSION_CONVRELU_OP -LOAD_OP2(fusion_conv_relu, CPU, GPU_CL); -LOAD_FUSION_MATCHER(fusion_conv_relu); -#endif -#ifdef GRU_OP -LOAD_OP1(gru, CPU); -#endif -#ifdef GRU_UNIT_OP -LOAD_OP1(gru_unit, CPU); -#endif -#ifdef FUSION_CONVADDBN_OP -LOAD_OP2(fusion_conv_add_bn, CPU, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_add_bn); -#endif -#ifdef DROPOUT_OP -LOAD_OP3(dropout, CPU, GPU_CL, FPGA); -#endif -#ifdef FUSION_DWCONVBNRELU_OP -LOAD_OP2(fusion_dwconv_bn_relu, CPU, GPU_CL); -LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu); -#endif -#ifdef CRF_OP -LOAD_OP1(crf_decoding, CPU); -#endif -#ifdef MUL_OP -LOAD_OP2(mul, CPU, GPU_CL); -#endif -#ifdef NORM_OP -LOAD_OP1(norm, CPU); -#endif -#ifdef RELU_OP -LOAD_OP2(relu, CPU, GPU_CL); -LOAD_OP2(relu6, CPU, GPU_CL); -#endif -#ifdef IM2SEQUENCE_OP 
-LOAD_OP1(im2sequence, CPU); -#endif -#ifdef LOOKUP_OP -LOAD_OP1(lookup_table, CPU); -#endif -#ifdef FUSION_FC_OP -LOAD_OP3(fusion_fc, CPU, GPU_CL, FPGA); -LOAD_FUSION_MATCHER(fusion_fc); -#endif -#ifdef POOL_OP -LOAD_OP3(pool2d, CPU, GPU_CL, FPGA); -#endif -#ifdef MULTICLASSNMS_OP -LOAD_OP2(multiclass_nms, CPU, GPU_CL); -#endif -#ifdef POLYGONBOXTRANSFORM_OP -LOAD_OP1(polygon_box_transform, CPU); -#endif -#ifdef SUM_OP -LOAD_OP1(sum, CPU); -#endif -#ifdef ELEMENTWISEMUL_OP -LOAD_OP1(elementwise_mul, CPU); -#endif -#ifdef SLICE_OP -LOAD_OP1(slice, CPU); -#endif -#ifdef FUSION_CONVBN_OP -LOAD_OP2(fusion_conv_bn, CPU, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_bn); -#endif -#ifdef ELEMENTWISESUB_OP -LOAD_OP2(elementwise_sub, CPU, GPU_CL) -#endif -#ifdef TOP_K_OP -LOAD_OP1(top_k, CPU) -#endif -#ifdef CAST_OP -LOAD_OP1(cast, CPU) -#endif -#ifdef QUANT_OP -LOAD_OP1(quantize, CPU); -#endif -#ifdef DEQUANT_OP -LOAD_OP1(dequantize, CPU); -#endif -#ifdef FUSION_DEQUANT_BN_OP -LOAD_OP1(fusion_dequant_bn, CPU); -LOAD_FUSION_MATCHER(fusion_dequant_bn); -#endif -#ifdef FUSION_DEQUANT_ADD_BN_OP -LOAD_OP1(fusion_dequant_add_bn, CPU); -LOAD_FUSION_MATCHER(fusion_dequant_add_bn); -#endif -#ifdef FUSION_DEQUANT_BN_RELU_OP -LOAD_OP1(fusion_dequant_bn_relu, CPU); -LOAD_FUSION_MATCHER(fusion_dequant_bn_relu); -#endif -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP -LOAD_OP1(fusion_dequant_add_bn_relu, CPU); -LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu); -#endif -#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP -LOAD_OP1(fusion_dequant_add_bn_quant, CPU); -LOAD_FUSION_MATCHER(fusion_dequant_add_bn_quant); -#endif -#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP -LOAD_OP1(fusion_dequant_add_bn_relu_quant, CPU); -LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant); -#endif -#ifdef SEQUENCE_EXPAND_OP -LOAD_OP1(sequence_expand, CPU); -#endif -#ifdef SEQUENCE_POOL_OP -LOAD_OP1(sequence_pool, CPU); -#endif -#ifdef SEQUENCE_SOFTMAX_OP -LOAD_OP1(sequence_softmax, CPU); -#endif -#ifdef LOG_OP -LOAD_OP1(log, CPU); 
-#endif -#ifdef LOD_RESET_OP -LOAD_OP1(lod_reset, CPU); -#endif -#ifdef LESS_THAN_OP -LOAD_OP1(less_than, CPU); -#endif -#ifdef LOGICAL_AND_OP -LOAD_OP1(logical_and, CPU); -#endif -#ifdef LOGICAL_OR_OP -LOAD_OP1(logical_or, CPU); -#endif -#ifdef LOGICAL_NOT_OP -LOAD_OP1(logical_not, CPU); -#endif -#ifdef LOGICAL_XOR_OP -LOAD_OP1(logical_xor, CPU); -#endif -#ifdef WHILE_OP -LOAD_OP1(while, CPU); -#endif -#ifdef WRITE_TO_ARRAY_OP -LOAD_OP1(write_to_array, CPU); -#endif -#ifdef READ_FROM_ARRAY_OP -LOAD_OP1(read_from_array, CPU); -#endif -#ifdef IS_EMPTY_OP -LOAD_OP1(is_empty, CPU); -#endif -#ifdef INCREMENT_OP -LOAD_OP1(increment, CPU); -#endif -#ifdef ANCHOR_GENERATOR_OP -LOAD_OP1(anchor_generator, CPU); -#endif -#ifdef PROPOSAL_OP -LOAD_OP1(generate_proposals, CPU); -#endif -#ifdef PSROI_POOL_OP -LOAD_OP1(psroi_pool, CPU); -#endif -#ifdef ROI_PERSPECTIVE_OP -LOAD_OP1(roi_perspective_transform, CPU); -#endif -#ifdef BEAM_SEARCH_OP -LOAD_OP1(beam_search, CPU); -#endif -#ifdef BEAM_SEARCH_DECODE_OP -LOAD_OP1(beam_search_decode, CPU); -#endif -#ifdef PAD2D_OP -LOAD_OP1(pad2d, CPU); -#endif -#ifdef ONE_HOT_OP -LOAD_OP1(one_hot, CPU); -#endif -#ifdef ASSIGN_VALUE_OP -LOAD_OP2(assign_value, CPU, GPU_CL); -#endif -#ifdef EXP_OP -LOAD_OP1(exp, CPU); -#endif -#ifdef ASSIGN_OP -LOAD_OP1(assign, CPU); -#endif -#ifdef CONDITIONAL_BLOCK_OP -LOAD_OP1(conditional_block, CPU); -#endif -#ifdef EQUAL_OP -LOAD_OP1(equal, CPU); -#endif -#ifdef FILL_CONSTANT_BATCH_SIZE_LIKE_OP -LOAD_OP1(fill_constant_batch_size_like, CPU); -#endif -#ifdef RANGE_OP -LOAD_OP1(range, CPU); -#endif -#ifdef REDUCE_PROD_OP -LOAD_OP1(reduce_prod, CPU); -#endif -#ifdef PIXEL_SHUFFLE_OP -LOAD_OP1(pixel_shuffle, GPU_CL); -#endif -#ifdef EXPAND_OP -LOAD_OP1(expand, GPU_CL); -#endif -#ifdef GRID_SAMPLER_OP -LOAD_OP1(grid_sampler, GPU_CL); -#endif diff --git a/mobile/src/framework/loader.cpp b/mobile/src/framework/loader.cpp deleted file mode 100644 index 2e422a3b32..0000000000 --- a/mobile/src/framework/loader.cpp 
+++ /dev/null @@ -1,310 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/loader.h" -#include - -#include "framework/lod_tensor.h" -#include "framework/program/program-optimize/program_optimize.h" -#ifdef PADDLE_MOBILE_CL -#include "framework/cl/cl_image.h" -#endif - -namespace paddle_mobile { -namespace framework { - -template -void Loader::InitMemoryFromProgram( - const std::shared_ptr &originProgramDesc, - const std::shared_ptr &scope) { - for (const auto &block : originProgramDesc.get()->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = scope.get()->Var(var_desc->Name()); - if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { - if (var_desc->Persistable()) { - auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(make_ddim(dim)); - } else { - auto dim = var_desc->Tensor_desc().Dims(); - if (dim.size() == 0) { - auto tensor = var->GetMutable(); - framework::DDim dDim = {0}; - tensor->Resize(dDim); - } else { - for (auto &d : dim) { - if (d < 0) { - d *= -1; - } - } - auto tensor = var->GetMutable(); - tensor->Resize(make_ddim(dim)); - } - } - } else { - // TODO(codeWorm) - } - } - } -} - -#ifdef PADDLE_MOBILE_CL -template <> -void Loader::InitMemoryFromProgram( - const std::shared_ptr &originProgramDesc, - const std::shared_ptr &scope) { - for (const auto &block : originProgramDesc.get()->Blocks()) { - for (const auto &var_desc : 
block->Vars()) { - auto var = scope.get()->Var(var_desc->Name()); - if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { - if (var_desc->Persistable()) { - auto dim = var_desc->Tensor_desc().Dims(); - auto cl_image = var->GetMutable(); - cl_image->Resize(make_ddim(dim)); - } else { - auto dim = var_desc->Tensor_desc().Dims(); - PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); - if (dim.size() == 0) { - auto tensor = var->GetMutable(); - framework::DDim dDim = {0}; - tensor->Resize(dDim); - } else { - for (auto &d : dim) { - if (d < 0) { - d *= -1; - } - } - } - auto cl_image = var->GetMutable(); - cl_image->Resize(make_ddim(dim)); - } - } else { - // TODO(codeWorm) - } - } - } -} -template <> -const Program Loader::LoadCombinedMemory( - size_t read_size, const uint8_t *buf, size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize, bool quantification, - int quantification_fold) { - bool can_add_split = false; - - PaddleMobile__Framework__Proto__ProgramDesc *c_program; - PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null"); - - c_program = paddle_mobile__framework__proto__program_desc__unpack( - nullptr, read_size, buf); - // - PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null"); - // - DLOG << "n_ops: " << (*c_program->blocks)->n_ops; - // - - auto originProgramDesc = std::make_shared(c_program); - - Program program; - program.combined = true; - program.originProgram = originProgramDesc; - program.quantification = quantification; - program.combined_params_len = combined_params_len; - program.combined_params_buf = combined_params_buf; - program.quantification_fold = quantification_fold; - - auto scope = std::make_shared(); - program.scope = scope; - InitMemoryFromProgram(originProgramDesc, scope); - if (optimize) { - ProgramOptimize program_optimize; - program.optimizeProgram = - program_optimize.FusionOptimize(originProgramDesc, can_add_split); - if (!program.optimizeProgram) { - program.optimizeProgram = 
originProgramDesc; - } - } - if (optimize) { - program.optimizeProgram->Description("optimize: "); - } else { - originProgramDesc->Description("program: "); - } - paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, - nullptr); - return program; -} - -#endif - -/** - * fusion and print someinfos - * @tparam Device - * @tparam P - * @param optimize - * @param can_add_split - * @param program - * @param originProgramDesc - */ -template -void FusionAndPrintInfos( - bool optimize, bool can_add_split, Program *program, - const std::shared_ptr &originProgramDesc) { - if (optimize) { - ProgramOptimize program_optimize; - program->optimizeProgram = - program_optimize.FusionOptimize(originProgramDesc, can_add_split); - if (!program->optimizeProgram) { - program->optimizeProgram = originProgramDesc; - } - } - if (optimize) { - program->optimizeProgram->Description("optimize: "); - } else { - originProgramDesc->Description("program: "); - } -} - -static size_t ReadBuffer(const char *file_name, uint8_t **out) { - FILE *fp; - fp = fopen(file_name, "rb"); - PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); - - fseek(fp, 0, SEEK_END); - size_t size = ftell(fp); - rewind(fp); - - DLOG << "model size: " << size; - PADDLE_MOBILE_ENFORCE(size > 0, "model size should > 0") - *out = reinterpret_cast(malloc(size)); - - size_t cur_len = 0; - size_t nread; - while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { - cur_len += nread; - } - fclose(fp); - return cur_len; -} - -template -const Program Loader::Load(const std::string &dirname, - bool optimize, - bool quantification, - bool can_add_split, - int quantification_fold) { - auto program = - this->LoadProgram(dirname + "/__model__", optimize, quantification, - can_add_split, quantification_fold); - program.model_path = dirname; - return program; -} - -template -const Program Loader::Load(const std::string &model_path, - const std::string ¶_path, - bool optimize, - bool quantification, 
- int quantification_fold) { - auto program = this->LoadProgram(model_path, optimize, quantification, false, - quantification_fold); - - program.para_path = para_path; - program.combined = true; - program.quantification = quantification; - return program; -} - -template -const Program Loader::LoadProgram( - const std::string &model_path, bool optimize, bool quantification, - bool can_add_split, int quantification_fold) { - std::string model_filename = model_path; - PaddleMobile__Framework__Proto__ProgramDesc *c_program; - uint8_t *buf = NULL; - size_t read_size = ReadBuffer(model_filename.c_str(), &buf); - - PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); - - c_program = paddle_mobile__framework__proto__program_desc__unpack( - NULL, read_size, buf); - // - PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); - // - DLOG << "n_ops: " << (*c_program->blocks)->n_ops; - // - auto originProgramDesc = std::make_shared(c_program); - - Program program; - program.originProgram = originProgramDesc; - program.quantification = quantification; - program.combined_params_len = 0; - program.combined_params_buf = nullptr; - program.quantification_fold = quantification_fold; - auto scope = std::make_shared(); - program.scope = scope; - - // use originProgramDesc and scope to init tensors - InitMemoryFromProgram(originProgramDesc, scope); - // perform fusion and print infos - FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); - - paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); - free(buf); - return program; -} - -template -const Program Loader::LoadCombinedMemory( - size_t read_size, const uint8_t *buf, size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize, bool quantification, - int quantification_fold) { - bool can_add_split = false; - - PaddleMobile__Framework__Proto__ProgramDesc *c_program; - PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null"); - - c_program = 
paddle_mobile__framework__proto__program_desc__unpack( - nullptr, read_size, buf); - // - PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null"); - // - DLOG << "n_ops: " << (*c_program->blocks)->n_ops; - // - - auto originProgramDesc = std::make_shared(c_program); - - Program program; - program.combined = true; - program.originProgram = originProgramDesc; - program.quantification = quantification; - program.combined_params_len = combined_params_len; - program.combined_params_buf = combined_params_buf; - program.quantification_fold = quantification_fold; - - auto scope = std::make_shared(); - program.scope = scope; - InitMemoryFromProgram(originProgramDesc, scope); - FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); - paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, - nullptr); - return program; -} - -template class Loader; - -template class Loader; - -template class Loader; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/loader.h b/mobile/src/framework/loader.h deleted file mode 100644 index 40ded643d5..0000000000 --- a/mobile/src/framework/loader.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "common/types.h" -#include "framework/program/program.h" - -namespace paddle_mobile { -namespace framework { - -template -class Loader { - public: - /* - * @b load separate format fluid model - * @b 加载分开存储的fluid模型 - * */ - const Program Load(const std::string &dirname, - bool optimize = false, - bool quantification = false, - bool can_add_split = false, - int quantification_fold = 1); - - /* - * @b load combine format fluid mode - * @b 加载统一存储的fluid模型 - * */ - const Program Load(const std::string &model_path, - const std::string ¶_path, - bool optimize = false, - bool quantification = false, - int quantification_fold = 1); - - const Program LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize = false, - bool quantification = false, int quantification_fold = 1); - - private: - const Program LoadProgram(const std::string &model_path, - bool optimize = false, - bool quantification = false, - bool can_add_split = false, - int quantification_fold = 1); - - void InitMemoryFromProgram( - const std::shared_ptr &originProgramDesc, - const std::shared_ptr &scope); -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/lod_tensor.cpp b/mobile/src/framework/lod_tensor.cpp deleted file mode 100644 index 0a1a6f881d..0000000000 --- a/mobile/src/framework/lod_tensor.cpp +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/lod_tensor.h" -#include - -namespace paddle_mobile { -namespace framework { - -LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, - size_t elem_end) { - PADDLE_MOBILE_ENFORCE(level < in.size(), "level should >= in.size()"); - PADDLE_MOBILE_ENFORCE(elem_end < in[level].size(), - "elem_end >= in[level].size()"); - LoD res; - res.resize(in.size() - level); - // copy the first level - res[0].assign(in[level].begin() + elem_begin, - in[level].begin() + elem_end + 1); - for (size_t lvl = 1; lvl < res.size(); lvl++) { - const auto &in_level = in[level + lvl]; - const auto &above_level = res[lvl - 1]; - auto &out_level = res[lvl]; - out_level.assign(in_level.begin() + above_level.front(), - in_level.begin() + above_level.back() + 1); - } - for (size_t lvl = 0; lvl < res.size(); lvl++) { - // to make the first offset equals 0, all the elements minus the - // first - // element - size_t front = res[lvl].front(); - for (auto &ele : res[lvl]) { - ele -= front; - } - } - return res; -} - -LoD ToAbsOffset(const LoD &in) { - // the lowest level stores relative offsets - if (in.empty() || in.size() == 1) return in; - LoD result = in; - for (auto level = static_cast(in.size() - 2); level >= 0; level--) { - for (size_t i = 0; i < in[level].size(); ++i) { - size_t index = in[level][i]; - result[level][i] = result[level + 1][index]; - } - } - return result; -} - -bool operator==(const LoD &a, const LoD &b) { - if (a.size() != b.size()) { - return false; - } - - for (size_t i = 0; i < a.size(); i++) { - const auto &a_level = a[i]; - const auto &b_level = b[i]; - if (a_level.size() != b_level.size()) { - return false; - } - for (size_t j = 0; j < a_level.size(); j++) { - if (a_level[j] != b_level[j]) { - return false; - } - } - } - return true; -} - -bool CheckLoD(const LoD &in, int tensor_height) { - if (in.empty()) return true; - for (const auto 
&level : in) { - // check: there should be more than 2 offsets existing in each - // level. - if (level.size() < 2) return false; - // check: the first offset(the begin offset) of each level - // should be 0. - if (level.front() != 0) return false; - // check: all the offsets in a level should be ascending(no same - // items - // allows). - if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { - if (a < b) return true; - return false; - })) { - PADDLE_MOBILE_THROW_EXCEPTION("ascending error") - return false; - } - } - // check: the lowest level's last offset should equals - // `tensor_height` if - // tensor_height>0. - if (tensor_height > 0 && (size_t)tensor_height != in.back().back()) - return false; - - // check: the higher level's last offset should equals the lower - // level's - // size-1. - // NOTE LoD store the levels from top to bottom, so the higher level - // goes - // first. - for (size_t level = 0; level < in.size() - 1; level++) { - if (in[level].back() != in[level + 1].size() - 1) return false; - } - return true; -} - -bool CheckAbsLoD(const LoD &in, int tensor_height) { - if (in.empty()) return true; - for (const auto &level : in) { - // check: all the offsets in a level should be ascending(no same - // items - // allows). - if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { - if (a < b) return true; - return false; - })) { - return false; - } - - // check: there should be more than 2 offsets existing in each - // level. - if (level.size() < 2) return false; - - // check: the first offset of each level should be 0, and the - // last should be - // the same(the height of underlying tensor). 
- if (level.front() != 0) return false; - if (tensor_height < 0) { - tensor_height = level.back(); - } else if ((size_t)tensor_height != level.back()) { - return false; - } - } - return true; -} - -using LoDAndOffset = std::pair>; - -LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, - size_t end_idx, size_t start_level) { - LoD sub_lod; - - for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) { - PADDLE_MOBILE_ENFORCE(start_idx <= end_idx, "start_idx > end_idx"); - PADDLE_MOBILE_ENFORCE(end_idx < lod[level_idx].size(), - "end_idx >= lod[level_idx].size()"); - std::vector level_lens; - for (size_t i = start_idx; i < end_idx; ++i) { - level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]); - } - sub_lod.emplace_back(level_lens); - start_idx = lod[level_idx][start_idx]; - end_idx = lod[level_idx][end_idx]; - } - - return LoDAndOffset{sub_lod, {start_idx, end_idx}}; -} - -void AppendLoD(LoD *lod, const LoD &lod_length) { - PADDLE_MOBILE_ENFORCE( - lod->empty() || lod->size() == lod_length.size(), - "The lod_length should has the same size with the appended lod."); - if (lod->empty()) { - for (size_t i = 0; i < lod_length.size(); ++i) { - lod->emplace_back(1, 0); // size = 1, value = 0; - } - *lod = LoD(lod_length.size(), std::vector({0})); - } - for (size_t i = 0; i < lod->size(); ++i) { - auto &level = (*lod)[i]; - for (size_t len : lod_length[i]) { - level.push_back(level.back() + len); - } - } -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/lod_tensor.h b/mobile/src/framework/lod_tensor.h deleted file mode 100644 index 6d67b517ff..0000000000 --- a/mobile/src/framework/lod_tensor.h +++ /dev/null @@ -1,234 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include "framework/tensor.h" -#include "framework/tensor_util.h" - -namespace paddle_mobile { -namespace framework { - -/* - * LoD is short for Level of Details. - * - * - in a level, each element indicates relative offset of the lower - * level - * - the first element should be 0 and that indicates that this sequence - * start - * from 0 - * - each sequence's begin and end(no-inclusive) is level[id, id+1] - * - * For example: - * 3-level LoD stores - * - * 0 2 3 - * 0 2 4 7 - * 0 2 5 7 10 12 15 20 - */ -using LoD = std::vector>; - -std::ostream &operator<<(std::ostream &os, const LoD &lod); - -std::ostream &operator<<(std::ostream &os, const LoDTensor &t); - -std::string LoDToString(const LoD &lod); - -LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, - size_t elem_end); - -/* - * Transform an LoD from relative offsets to absolute offsets. - */ -LoD ToAbsOffset(const LoD &in); - -bool operator==(const LoD &a, const LoD &b); - -/* - * Check whether this lod's format is valid. - * - * ATTENTION: - * - Empty lod is treated as valid. - * - * It will check two things: - * - * 1. all the offsets in a level should be ascending(no same items - * allows). - * 2. there should be more than 2 offsets existing in each level. - * 3. the higher level's last offset should equals the lower level's - * size-1. - * 4. the first offset(the begin offset) of each level should be 0. - * 5. the lowest level's last offset should equals `tensor_height` if - * tensor_height>0. 
- */ - -bool CheckLoD(const LoD &in, int tensor_height = -1); - -/* - * Check whether this absolute lod's format is valid. - * - * ATTENTION: - * - Empty lod is treated as valid. - * - * It will check two things: - * 1. all the offsets in a level should be ascending(no same items - * allows) - * 2. there should be more than 2 offsets existing in each level. - * 3. the first offset of each level should be 0, and the last should - * be the - * same(the height of underlying tensor) or `tensor_height` if - * tensor_height>0. - */ -bool CheckAbsLoD(const LoD &in, int tensor_height = -1); - -/* - * LoDTensor (Level of details Tensor) - * see https://en.wikipedia.org/wiki/Level_of_details for reference. - */ -class LoDTensor : public Tensor { - public: - LoDTensor() : Tensor() {} - - explicit LoDTensor(const LoD &lod) : lod_(lod) {} - - void set_lod(const LoD &lod) { lod_ = lod; } - - const LoD &lod() const { return lod_; } - - LoD *mutable_lod() { return &lod_; } - - /* - * Get the start offset and end offset of an element from LoD. - */ - std::pair lod_element(size_t level, size_t elem) const { - // PADDLE_ENFORCE_LT(level, NumLevels()); - // PADDLE_ENFORCE_LT(elem, NumElements(level)); - return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]); - } - - /* - * Number of LoDTensor's levels, each level has units of data, for - * example, - * in the sentence's view, article, paragraph, sentence are 3 - * levels. - */ - size_t NumLevels() const { return lod_.size(); } - - /* - * Number of elements in a level. - */ - size_t NumElements(size_t level = 0) const { - // PADDLE_ENFORCE_LT(level, NumLevels()); - // the last offset is the end of last element - return (lod_)[level].size() - 1; - } - - private: - LoD lod_; -}; - -/* - * Expand the `source` to fit the LoD of `lod`. 
For example, a `source` - * LoDTensor is - * - LoD: [0, 2] - * - tensor: [a0, a1] - * a `lod` is - * - LoD: [0 3 5] - * returns a new LoDTensor - * - [a0 a0 a0 a1 a1] - */ -template -LoDTensor LodExpand(const LoDTensor &source, const LoD &lod, size_t level) { - LoD abs_lod = ToAbsOffset(lod); - const auto &lod_level = lod[level]; - size_t num_instances = source.dims()[0]; - - // new tensor - LoDTensor tensor; - tensor.set_lod(lod); - auto dims = source.dims(); - dims[0] = lod_level.back(); - tensor.Resize(dims); - tensor.mutable_data(); - - // PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); - for (size_t ins = 0; ins < num_instances; ins++) { - for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { - auto slice = tensor.Slice(elem, elem + 1); - TensorCopy(source.Slice(ins, ins + 1), &slice); - } - } - return tensor; -} - -using LoDTensorArray = std::vector; - -// Get the absolute offset of a lod[start_level][start_idx:end_idx] and -// relative length of details for every levels(i.e., [start_level: ]). -// -// For example, -// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] -// start_level = 0 -// start_idx = 1 -// end_idx = 3 -// -// Returns: -// LoD = [[1, 4], [2, 4, 2, 3, 2]] -// pair = {11, 24} -std::pair> GetSubLoDAndAbsoluteOffset( - const LoD &lod, size_t start_idx, size_t end_idx, size_t start_level); - -void AppendLoD(LoD *lod, const LoD &lod_length); - -/* - * Serialize/Desiralize LoDTensor to std::ostream - * You can pass ofstream or ostringstream to serilize to file - * or to a in memory string. GPU tensor will be copied to CPU. - */ -void SerializeToStream(std::ostream &os, const LoDTensor &tensor); - -void DeserializeFromStream(std::istream &is, LoDTensor *tensor); - -#ifdef PADDLE_MOBILE_DEBUG -inline Print &operator<<(Print &printer, const LoDTensor &tensor) { - printer << " dims: " << tensor.dims() << "\n"; - int stride = tensor.numel() / 20; - stride = stride > 0 ? 
stride : 1; -#ifndef PADDLE_MOBILE_FPGA - for (int i = 0; i < tensor.numel(); i += stride) { - if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } else if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } else if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } else if (tensor.type() == type_id()) { - printer << static_cast(tensor.data()[i]) << " "; - } else if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } else if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } - } -#endif // PADDLE_MOBILE_FPGA - return printer; -} -#endif // PADDLE_MOBILE_DEBUG - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/mixed_vector.h b/mobile/src/framework/mixed_vector.h deleted file mode 100644 index 6e46164fb7..0000000000 --- a/mobile/src/framework/mixed_vector.h +++ /dev/null @@ -1,271 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include -#include -#include -#include "framework/tensor.h" -#include "framework/tensor_util.h" - -namespace paddle_mobile { -namespace framework { - -// Vector implements the std::vector interface, and can get Data or -// MutableData from any place. The data will be synced implicitly inside. -template -class Vector { - public: - using value_type = T; - // Default ctor. 
Create empty Vector - Vector() { InitEmpty(); } - - // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T& value = T()) { - InitEmpty(); - if (count != 0) { - resize(count); - T* ptr = begin(); - for (size_t i = 0; i < count; ++i) { - ptr[i] = value; - } - } - } - - // Ctor with init_list - Vector(std::initializer_list init) { - if (init.size() == 0) { - InitEmpty(); - } else { - InitByIter(init.size(), init.begin(), init.end()); - } - } - - // implicit cast from std::vector. - template - Vector(const std::vector& dat) { // NOLINT - if (dat.size() == 0) { - InitEmpty(); - } else { - InitByIter(dat.size(), dat.begin(), dat.end()); - } - } - - // Copy ctor - Vector(const Vector& other) { this->operator=(other); } - - // Copy operator - Vector& operator=(const Vector& other) { - if (other.size() != 0) { - this->InitByIter(other.size(), other.begin(), other.end()); - } else { - InitEmpty(); - } - return *this; - } - - // Move ctor - Vector(Vector&& other) { - this->size_ = other.size_; - this->flag_ = other.flag_; - if (other.cuda_vec_.memory_size()) { - this->cuda_vec_.ShareDataWith(other.cuda_vec_); - } - if (other.cpu_vec_.memory_size()) { - this->cpu_vec_.ShareDataWith(other.cpu_vec_); - } - } - - // CPU data access method. Mutable. - T& operator[](size_t i) { - MutableCPU(); - return const_cast(cpu_vec_.data())[i]; - } - - // CPU data access method. Immutable. - const T& operator[](size_t i) const { - // ImmutableCPU(); - return cpu_vec_.data()[i]; - } - - // std::vector iterator methods. Based on CPU data access method - size_t size() const { return size_; } - - T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } - - T* end() { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } - - T& front() { return *begin(); } - - T& back() { - auto it = end(); - --it; - return *it; - } - - const T* begin() const { - return capacity() == 0 ? 
&EmptyDummy() : &this->operator[](0); - } - - const T* end() const { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } - - const T* cbegin() const { return begin(); } - - const T* cend() const { return end(); } - - const T& back() const { - auto it = end(); - --it; - return *it; - } - - T* data() { return begin(); } - - const T* data() const { return begin(); } - - const T& front() const { return *begin(); } - // end of std::vector iterator methods - - // assign this from iterator. - // NOTE: the iterator must support `end-begin` - template - void assign(Iter begin, Iter end) { - InitByIter(end - begin, begin, end); - } - - // push_back. If the previous capacity is not enough, the memory will - // double. - void push_back(T elem) { - if (size_ + 1 > capacity()) { - reserve((size_ + 1) << 1); - } - *end() = elem; - ++size_; - } - - // extend a vector by iterator. - // NOTE: the iterator must support end-begin - template - void Extend(It begin, It end) { - size_t pre_size = size_; - resize(pre_size + (end - begin)); - T* ptr = this->begin() + pre_size; - for (; begin < end; ++begin, ++ptr) { - *ptr = *begin; - } - } - - // resize the vector - void resize(size_t size) { - if (size + 1 <= capacity()) { - size_ = size; - } else { - MutableCPU(); - Tensor cpu_tensor; - T* ptr = cpu_tensor.mutable_data( - framework::make_ddim({static_cast(size)})); - const T* old_ptr = - cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + size_, ptr); - } - size_ = size; - cpu_vec_.ShareDataWith(cpu_tensor); - } - } - - // clear - void clear() { - size_ = 0; - flag_ = kDirty | kDataInCPU; - } - - size_t capacity() const { - return cpu_vec_.memory_size() / SizeOfType(type_id().hash_code()); - } - - // reserve data - void reserve(size_t size) { - size_t pre_size = size_; - resize(size); - resize(pre_size); - } - - // implicit cast operator. Vector can be cast to std::vector implicitly. 
- operator std::vector() const { - std::vector result; - result.resize(size()); - std::copy(begin(), end(), result.begin()); - return result; - } - - bool operator==(const Vector& other) const { - if (size() != other.size()) return false; - auto it1 = cbegin(); - auto it2 = other.cbegin(); - for (; it1 < cend(); ++it1, ++it2) { - if (*it1 != *it2) { - return false; - } - } - return true; - } - - private: - void InitEmpty() { - size_ = 0; - flag_ = kDataInCPU; - } - - template - void InitByIter(size_t size, Iter begin, Iter end) { - T* ptr = this->cpu_vec_.template mutable_data( - framework::make_ddim({static_cast(size)})); - for (size_t i = 0; i < size; ++i) { - *ptr++ = *begin++; - } - flag_ = kDataInCPU | kDirty; - size_ = size; - } - - enum DataFlag { - kDataInCPU = 0x01, - kDataInCUDA = 0x02, - // kDirty means the data has been changed in one device. - kDirty = 0x10 - }; - - void MutableCPU() { flag_ = kDirty | kDataInCPU; } - - void UnsetFlag(int flag) const { flag_ &= ~flag; } - void SetFlag(int flag) const { flag_ |= flag; } - - static T& EmptyDummy() { - static T dummy = T(); - return dummy; - } - - mutable int flag_; - mutable Tensor cpu_vec_; - mutable Tensor cuda_vec_; - size_t size_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/op_info.h b/mobile/src/framework/op_info.h deleted file mode 100644 index c250f61664..0000000000 --- a/mobile/src/framework/op_info.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "common/log.h" -#include "common/type_define.h" -#include "framework/scope.h" - -namespace paddle_mobile { -namespace framework { - -template -class OperatorBase; - -template -using OpCreator = std::function *( - const std::string & /*type*/, const VariableNameMap & /*inputs*/, - const VariableNameMap & /*outputs*/, - const framework::AttributeMap & /*attrs*/, framework::Scope * /*scope*/)>; - -template -struct OpInfo { - OpCreator creator_; - const OpCreator &Creator() const { - PADDLE_MOBILE_ENFORCE(creator_ != nullptr, - "Operator Creator has not been registered"); - return creator_; - } -}; - -template -class OpInfoMap { - public: - static OpInfoMap *Instance() { - static OpInfoMap *s_instance = nullptr; - if (s_instance == nullptr) { - s_instance = new OpInfoMap(); - } - return s_instance; - } - - bool Has(const std::string &op_type) const { - return map_.find(op_type) != map_.end(); - } - - void Insert(const std::string &type, const OpInfo &info) { - PADDLE_MOBILE_ENFORCE(!Has(type), "Operator %s has been registered", - type.c_str()); - map_.insert({type, info}); - } - - const OpInfo &Get(const std::string &type) const { - auto op_info_ptr = GetNullable(type); - PADDLE_MOBILE_ENFORCE(op_info_ptr != nullptr, - "Operator %s has not been registered", type.c_str()); - return *op_info_ptr; - } - - const OpInfo *GetNullable(const std::string &type) const { - auto it = map_.find(type); - if (it == map_.end()) { - return nullptr; - } else { - return &it->second; - } - } - - const std::unordered_map> &map() const { - return map_; - } - - std::unordered_map> *mutable_map() { - return &map_; - } - - private: - OpInfoMap() = default; - std::unordered_map> map_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/op_kernel_type.h b/mobile/src/framework/op_kernel_type.h deleted 
file mode 100644 index fd59eb494d..0000000000 --- a/mobile/src/framework/op_kernel_type.h +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "framework/data_layout.h" -#include "framework/program/tensor_desc.h" - -namespace paddle_mobile { -namespace framework { -struct OpKernelType { - struct Hash { - size_t operator()(const OpKernelType &key) const { - int data_type = static_cast(key.data_type_) << LEFT_SHIFT; - int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); - - std::hash hasher; - return hasher(data_type + data_layout); - } - }; - - // place, data_type, library_type kinds less than 2^8 - constexpr static int LEFT_SHIFT = 8; - - VarType_Type data_type_; - DataLayout data_layout_; - - OpKernelType(VarType_Type data_type, - DataLayout data_layout = DataLayout::kAnyLayout) - : data_type_(data_type), data_layout_(data_layout) {} - - bool operator==(const OpKernelType &o) const { - return data_type_ == o.data_type_ && data_layout_ == o.data_layout_; - } - - bool operator!=(const OpKernelType &o) const { return !(*this == o); } -}; - -inline bool NeedTransformLayout(const DataLayout &l, const DataLayout &r) { - return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; -} - -inline bool TransFromNeeded(const OpKernelType &l, const OpKernelType &r) { - return (l.data_type_ != r.data_type_) || - NeedTransformLayout(l.data_layout_, 
r.data_layout_); -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/op_proto_maker.h b/mobile/src/framework/op_proto_maker.h deleted file mode 100644 index a41e65d357..0000000000 --- a/mobile/src/framework/op_proto_maker.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle_mobile { -namespace framework { -// this class not only make proto but also init attribute checkers. -class OpProtoAndCheckerMaker {}; -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/op_registry.h b/mobile/src/framework/op_registry.h deleted file mode 100644 index 3897fc02c8..0000000000 --- a/mobile/src/framework/op_registry.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "common/log.h" -#include "common/type_define.h" -#include "framework/op_info.h" -#include "framework/operator.h" - -namespace paddle_mobile { -namespace framework { - -class Registrar { - public: - void Touch() {} -}; - -template -class OperatorRegistrarRecursive; - -template -struct OperatorRegistrar : public Registrar { - explicit OperatorRegistrar(const std::string& op_type) { - if (OpInfoMap::Instance()->Has(op_type)) { - LOG(paddle_mobile::kLOG_DEBUG1) - << op_type << " is registered more than once."; - return; - } - if (sizeof...(ARGS) == 0) { - LOG(paddle_mobile::kLOG_DEBUG1) - << "OperatorRegistrar should be invoked at least by OpClass"; - return; - } - OpInfo info; - OperatorRegistrarRecursive(op_type, &info); - OpInfoMap::Instance()->Insert(op_type, info); - } -}; - -template -struct OpInfoFiller { - void operator()(const std::string& op_type, OpInfo* info) const { - info->creator_ = [](const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, - const AttributeMap& attrs, framework::Scope* scope) { - return new T(type, inputs, outputs, attrs, scope); - }; - } -}; - -template -class OperatorRegistrarRecursive { - public: - using T = typename std::tuple_element>::type; - OperatorRegistrarRecursive(const std::string& op_type, OpInfo* info) { - OpInfoFiller fill; - fill(op_type, info); - constexpr auto size = sizeof...(ARGS); - OperatorRegistrarRecursive reg( - op_type, info); - (void)(reg); - } -}; - -template -class OperatorRegistrarRecursive { - public: - OperatorRegistrarRecursive(const std::string& op_type, OpInfo* info) {} -}; - -template -class OpRegistry { - public: - static std::shared_ptr> CreateOp( - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap attrs, - paddle_mobile::framework::Scope* scope) { - auto& info = OpInfoMap::Instance()->Get(type); - auto op = info.Creator()(type, inputs, outputs, 
attrs, scope); - return std::shared_ptr>(op); - } -}; - -#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \ - template class op_class; \ - template \ - class _OpClass_##op_type##_##device_name : public op_class { \ - public: \ - DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \ - }; \ - static paddle_mobile::framework::OperatorRegistrar< \ - device_type, _OpClass_##op_type##_##device_name> \ - __op_registrar_##op_type##_##device_name(#op_type); \ - int TouchOpRegistrar_##op_type##_##device_name() { \ - __op_registrar_##op_type##_##device_name.Touch(); \ - return 0; \ - } - -#define REGISTER_OPERATOR_CPU(op_type, op_class) \ - REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU); - -#define REGISTER_OPERATOR_FPGA(op_type, op_class) \ - REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); - -#define REGISTER_OPERATOR_CL(op_type, op_class) \ - REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL); - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/operator.cpp b/mobile/src/framework/operator.cpp deleted file mode 100644 index a091a49b35..0000000000 --- a/mobile/src/framework/operator.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "framework/operator.h" -#include -#include "operators/op_param.h" -namespace paddle_mobile { -namespace framework { - -template -vector OperatorBase::GetOutKeys() const { - auto it = op_input_output_key.find(type_); - if (it == op_input_output_key.end()) { - DLOG << type_ << " has no outputs"; - return {}; - } - return it->second.second; -} - -template -vector OperatorBase::GetInputKeys() const { - auto it = op_input_output_key.find(type_); - if (it == op_input_output_key.end()) { - DLOG << type_ << " has no inputs"; - return {}; - } - return it->second.first; -} - -template -OperatorBase::OperatorBase(const std::string &type, - const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, - framework::Scope *scope) - : type_(type), - inputs_(inputs), - outputs_(outputs), - attrs_(attrs), - scope_(scope) { - CheckAllInputOutputSet(); -} - -template -void OperatorBase::CheckAllInputOutputSet() const {} - -template -void OperatorBase::Run() { - RunImpl(); -#ifdef PADDLE_MOBILE_DEBUG - DLOG << "-------------" << type_ << "----------------------------"; - vector input_keys = GetInputKeys(); - for (const auto key : input_keys) { - if (inputs_.count(key) > 0) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = this->scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; -#ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_in[i]; -#endif - } - } - } else { - DLOG << "did not find key (" << key << ") in inputs_"; - } - } - for (const auto key : GetOutKeys()) { - if (outputs_.count(key) > 0) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) 
DLOG << type_ << " output- " << key << "=" << *tensor; -#ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_out[i]; -#endif - } - } - } else { - DLOG << "did not find key (" << key << ") in outputs_"; - } - } -#endif -} - -#ifdef PADDLE_MOBILE_CL -template <> -void OperatorBase::Run() { - RunImpl(); -#ifdef PADDLE_MOBILE_DEBUG - DLOG << "-------------" << type_ << "----------------------------"; - vector input_keys = GetInputKeys(); - for (const auto key : input_keys) { - if (inputs_.count(key) > 0) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && - var->template IsType()) { - const CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " input- " << key << "=" << *cl_image; - } - } - } - } else { - DLOG << "did not find key (" << key << ") in inputs_"; - } - } - for (const auto key : GetOutKeys()) { - if (outputs_.count(key) > 0) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && - var->template IsType()) { - const CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " output- " << key << "=" << *cl_image; - } - } - } - } else { - DLOG << "did not find key (" << key << ") in outputs_"; - } - } -#endif -} -#endif - -#ifdef PADDLE_MOBILE_FPGA -template -void OperatorBase::InsertTensors() { - static int feed_num = 0; - static int fetch_num = 0; - if (type_ == "feed") { - auto new_name = string("feed") + std::to_string(feed_num++); - auto var = scope_->Var(new_name); - var->template GetMutable(); - inputs_.at("X") = {string(new_name)}; - } else if (type_ == "fetch") { - auto new_name = string("fetch") + std::to_string(fetch_num++); - auto var = scope_->Var(new_name); - var->template GetMutable(); - outputs_.at("Out") = {string(new_name)}; - } -} -#endif - -template class OperatorBase; -template class 
OperatorBase; -template class OperatorBase; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/operator.h b/mobile/src/framework/operator.h deleted file mode 100644 index baffba97c2..0000000000 --- a/mobile/src/framework/operator.h +++ /dev/null @@ -1,211 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "common/enforce.h" -#include "common/type_define.h" -#include "common/types.h" -#include "common/variant.h" -#include "framework/attribute.h" -#include "framework/op_info.h" -#include "framework/op_kernel_type.h" -#include "framework/op_registry.h" -#include "framework/program/block_desc.h" -#include "framework/program/program-optimize/node.h" -#include "framework/scope.h" -#include "framework/tensor.h" -#include "framework/variable.h" -#ifdef PADDLE_MOBILE_CL -#include "framework/cl/cl_helper.h" -#include "framework/cl/cl_scope.h" -#endif - -namespace paddle_mobile { -namespace framework { - -template -static T *GetVarValue(const std::string &key, const VariableNameMap &var_map, - const Scope &scope) { - auto var_vec = var_map.at(key); - if (!var_vec.empty()) { - auto var = scope.FindVar(var_vec[0]); - return var->GetMutable(); - } else { - return nullptr; - } -} - -template -class OperatorBase { - public: - OperatorBase(const std::string &type, const VariableNameMap &inputs, - const 
VariableNameMap &outputs, const AttributeMap &attrs, - framework::Scope *scope); - virtual ~OperatorBase() {} - - virtual void Init() = 0; - virtual void InferShape() const = 0; - virtual void Run(); - virtual void RunImpl() = 0; - - std::vector GetOutKeys() const; - std::vector GetInputKeys() const; - - const VariableNameMap &Inputs() const { return inputs_; } - const VariableNameMap &Outputs() const { return outputs_; } - const std::string &Type() const { return type_; } - const AttributeMap &Attrs() const { return attrs_; } - void setPrePostType(int prePostType) { pre_post_type_ = prePostType; } - - void ClearVariables(const std::vector &var_names) const { - if (this->scope_) { - this->scope_->EraseVars(var_names); - } - } -#ifdef PADDLE_MOBILE_FPGA - void InsertTensors(); -#endif - - protected: - framework::Scope *scope_; - std::string type_; - VariableNameMap inputs_; - VariableNameMap outputs_; - AttributeMap attrs_; - int pre_post_type_ = 0; - - private: - void CheckAllInputOutputSet() const; -}; - -template -class OperatorWithKernel : public OperatorBase { - public: - OperatorWithKernel(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - framework::Scope *scope) - : OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, scope) { -#ifdef PADDLE_MOBILE_CL - kernel_.InitCLHelper(scope->GetCLScpoe()); -#endif - } - virtual void RunImpl() { this->kernel_.Compute(this->param_); } - - virtual void InferShape() const = 0; - - void Init() { - if (this->pre_post_type_ != NONE_PRE_POST) { - kernel_.setPrePostType(this->pre_post_type_); - } - PADDLE_MOBILE_ENFORCE(kernel_.Init(¶m_), " %s kernel init failed", - this->type_.c_str()); - } - - protected: - KernelType kernel_; - ParamType param_; -}; - -template -class OpKernelBase { - public: - OpKernelBase() = default; - -#ifdef PADDLE_MOBILE_CL - virtual void InitCLHelper(CLScope *clScope) { - cl_helper_ = CLHelper(clScope); - 
} -#endif - - virtual void Compute(const P ¶) = 0; - virtual bool Init(P *para) { return true; } - virtual ~OpKernelBase() = default; - virtual void setPrePostType(int prePostType) { pre_post_type_ = prePostType; } - - protected: -#ifdef PADDLE_MOBILE_CL - CLHelper cl_helper_; -#endif - int pre_post_type_ = 0; - - private: -}; - -class FusionOpMatcher { - public: - FusionOpMatcher() {} - - virtual std::string Type() = 0; - - virtual void FolderNodes( - Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), {}, removed_nodes); - } - - virtual Node &BeginNode() { return node_; } - - std::string BeginType() { return node_.Type(); } - - virtual std::vector> NeedCheck() { return {}; } - - protected: - Node node_; - std::string type_; - std::shared_ptr new_opdesc_; -}; - -#define DECLARE_OPERATOR(OpName, OpParam, OpKernel) \ - template \ - class OpName##Op : public framework::OperatorWithKernel< \ - DeviceType, OpParam, \ - operators::OpKernel> { \ - public: \ - OpName##Op(const std::string &type, const VariableNameMap &inputs, \ - const VariableNameMap &outputs, \ - const framework::AttributeMap &attrs, framework::Scope *scope) \ - : framework::OperatorWithKernel, \ - operators::OpKernel>( \ - type, inputs, outputs, attrs, scope) {} \ - \ - void InferShape() const override; \ - }; - -#define DECLARE_KERNEL(OpName, OpParam) \ - template \ - class OpName##Kernel \ - : public framework::OpKernelBase> { \ - public: \ - bool Init(OpParam *param); \ - void Compute(const OpParam ¶m); \ - }; - -#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ - cls(const std::string &type, const ::paddle_mobile::VariableNameMap &inputs, \ - const ::paddle_mobile::VariableNameMap &outputs, \ - const ::paddle_mobile::framework::AttributeMap &attrs, \ - ::paddle_mobile::framework::Scope *scope) \ - : parent_cls(type, inputs, outputs, attrs, scope) {} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/block_desc.cpp 
b/mobile/src/framework/program/block_desc.cpp deleted file mode 100644 index 4e3eb79d07..0000000000 --- a/mobile/src/framework/program/block_desc.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "block_desc.h" -#include - -namespace paddle_mobile { -namespace framework { - -std::vector> BlockDesc::Vars() const { return vars_; } - -std::vector> BlockDesc::Ops() const { return ops_; } - -BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc) - : index_(desc->idx), parent_index_(desc->idx) { - for (int i = 0; i < desc->n_vars; ++i) { - PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i]; - vars_.emplace_back(std::shared_ptr(new VarDesc(var_desc))); - } - - std::sort(vars_.begin(), vars_.end(), - [](std::shared_ptr left, std::shared_ptr right) { - return left->Name() < right->Name(); - }); - - for (int j = 0; j < desc->n_ops; ++j) { - PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j]; - ops_.emplace_back(new framework::OpDesc(op_desc)); - } -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/block_desc.h b/mobile/src/framework/program/block_desc.h deleted file mode 100644 index 86dd832d1b..0000000000 --- a/mobile/src/framework/program/block_desc.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "framework/framework.pb-c.h" -#include "framework/program/op_desc.h" -#include "framework/program/var_desc.h" - -namespace paddle_mobile { -namespace framework { - -class BlockDesc { - public: - friend class Node; - friend class ProgramOptimize; - BlockDesc() {} - explicit BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc); - explicit BlockDesc(const BlockDesc &block_desc) - : index_(block_desc.index_), parent_index_(block_desc.parent_index_) { - for (auto &op_desc : block_desc.ops_) { - std::shared_ptr copy_op_desc = std::make_shared(*op_desc); - ops_.push_back(copy_op_desc); - } - - for (int i = 0; i < block_desc.vars_.size(); ++i) { - auto &var_desc = block_desc.vars_[i]; - vars_.emplace_back(std::make_shared(*var_desc)); - } - } - - const int &ID() const { return index_; } - - const bool &MultiThread() const { return multi_thread_; } - - const int &Parent() const { return parent_index_; } - - bool operator==(const paddle_mobile::framework::BlockDesc &in_block) const { - return this->ID() == in_block.ID() && this->Parent() == in_block.Parent(); - } - - bool operator<(const paddle_mobile::framework::BlockDesc &in_block) const { - return this->ID() < in_block.ID() && this->Parent() < in_block.Parent(); - } - - std::vector> Vars() const; - std::vector> Ops() const; - - private: - int index_; - bool multi_thread_; - int parent_index_; - std::vector> ops_; - std::vector> vars_; -}; - 
-} // namespace framework -} // namespace paddle_mobile - -namespace std { - -template <> -struct hash { - typedef paddle_mobile::framework::BlockDesc argument_type; - typedef std::size_t result_type; - result_type operator()(argument_type const &s) const noexcept { - result_type const h1(std::hash{}(s.ID())); - result_type const h2(std::hash{}(s.ID())); - return h1 ^ (h2 << 1); - } -}; - -} // namespace std diff --git a/mobile/src/framework/program/op_desc.cpp b/mobile/src/framework/program/op_desc.cpp deleted file mode 100644 index ba3105778e..0000000000 --- a/mobile/src/framework/program/op_desc.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "framework/program/op_desc.h" - -namespace paddle_mobile { -namespace framework { - -OpDesc::OpDesc(PaddleMobile__Framework__Proto__OpDesc *desc) { - this->type_ = std::string(desc->type); - for (int i = 0; i < desc->n_inputs; ++i) { - PaddleMobile__Framework__Proto__OpDesc__Var *var = desc->inputs[i]; - std::vector &args = inputs_[std::string(var->parameter)]; - for (int j = 0; j < var->n_arguments; ++j) { - args.emplace_back(std::string(var->arguments[j])); - } - } - - for (int i = 0; i < desc->n_outputs; ++i) { - PaddleMobile__Framework__Proto__OpDesc__Var *var = desc->outputs[i]; - std::vector &args = outputs_[std::string(var->parameter)]; - for (int j = 0; j < var->n_arguments; ++j) { - args.emplace_back(std::string(var->arguments[j])); - } - } - - for (int k = 0; k < desc->n_attrs; ++k) { - PaddleMobile__Framework__Proto__OpDesc__Attr *attr = desc->attrs[k]; - std::string attr_name(attr->name); - attrs_[attr_name] = Attribute::GetAttrValue(attr); - proto_attrs_.push_back(*attr); - } -} - -const std::vector - &OpDesc::GetProtoAttr() const { - return proto_attrs_; -} - -const std::vector &OpDesc::Input(const std::string &name) const { - return inputs_.find(name)->second; -} - -const std::vector &OpDesc::Output(const std::string &name) const { - return outputs_.find(name)->second; -} - -Attribute OpDesc::GetAttr(const std::string &name) const { - auto it = attrs_.find(name); - return it->second; -} - -void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { - this->attrs_[name].Set(block); -} - -void OpDesc::SetBlocksAttr(const std::string &name, - std::vector blocks) { - this->attrs_[name].Set>(blocks); -} - -std::unordered_map &OpDesc::GetAttrMap() { - return attrs_; -} - -Print &operator<<(Print &printer, const OpDesc &op_desc) { - OpDesc &no_const_op_desc = const_cast(op_desc); - printer << "inputs: \n"; - for (const auto &input : no_const_op_desc.GetInputs()) { - printer << input.first << " : " << 
input.second << "\n"; - } - - printer << "outputs: \n"; - for (const auto &output : no_const_op_desc.GetOutputs()) { - printer << output.first << " : " << output.second << "\n"; - } - - printer << "outputs: \n"; - for (const auto &attr : no_const_op_desc.GetAttrMap()) { - printer << attr.first << " : " << attr.second << "\n"; - } - return printer; -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/op_desc.h b/mobile/src/framework/program/op_desc.h deleted file mode 100644 index 89c877ba12..0000000000 --- a/mobile/src/framework/program/op_desc.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "common/log.h" -#include "common/types.h" -#include "framework/attribute.h" -#include "framework/framework.pb-c.h" - -namespace paddle_mobile { -namespace framework { - -class OpDesc { - public: - friend class ProgramOptimize; - friend class FusionOpMatcher; - friend class Node; - - explicit OpDesc(PaddleMobile__Framework__Proto__OpDesc *op_desc); - OpDesc(const OpDesc &op_desc) : type_(op_desc.type_) { - this->inputs_ = op_desc.inputs_; - this->outputs_ = op_desc.outputs_; - this->attrs_ = op_desc.attrs_; - this->proto_attrs_ = op_desc.proto_attrs_; - } - - OpDesc() {} - const std::vector &Input(const std::string &name) const; - const std::vector &Output(const std::string &name) const; - Attribute GetAttr(const std::string &name) const; - - const std::vector - &GetProtoAttr() const; - - void SetBlockAttr(const std::string &name, BlockDesc *block); - void SetBlocksAttr(const std::string &name, std::vector block); - - VariableNameMap &GetInputs() { return inputs_; } - - VariableNameMap &GetOutputs() { return outputs_; } - - AttributeMap &GetAttrMap(); - - const std::string &Type() { return type_; } - - void SetInputs(VariableNameMap inputs) { inputs_ = inputs; } - - void SetOutputs(VariableNameMap outputs) { outputs_ = outputs; } - - void SetAttrMap(AttributeMap attrs) { attrs_ = attrs; } - - private: - std::string type_; - VariableNameMap inputs_; - VariableNameMap outputs_; - AttributeMap attrs_; - std::vector proto_attrs_; -}; - -Print &operator<<(Print &printer, const OpDesc &op_desc); - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/program-optimize/fusion_op_register.h b/mobile/src/framework/program/program-optimize/fusion_op_register.h deleted file mode 100644 index 1bf04bd6ec..0000000000 --- a/mobile/src/framework/program/program-optimize/fusion_op_register.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/node.h" - -namespace paddle_mobile { -namespace framework { - -class FusionOpRegister { - public: - static FusionOpRegister* Instance() { - static FusionOpRegister* regist = nullptr; - if (regist == nullptr) { - regist = new FusionOpRegister(); - } - return regist; - } - - void regist(FusionOpMatcher* matcher) { - if (matchers_.find(matcher->Type()) != matchers_.end()) { - return; - } - - std::shared_ptr shared_matcher(matcher); - matchers_[matcher->Type()] = shared_matcher; - } - - const std::vector> Matchers() { - std::vector> matchers; - for (const auto& match : matchers_) { - matchers.push_back(match.second); - } - std::sort(matchers.begin(), matchers.end(), - [](std::shared_ptr first, - std::shared_ptr second) { - return first->BeginNode().Depth() > second->BeginNode().Depth(); - }); - return matchers; - } - - private: - std::map> matchers_; - FusionOpRegister() {} -}; - -class FusionOpRegistrar { - public: - explicit FusionOpRegistrar(FusionOpMatcher* matcher) { - FusionOpRegister::Instance()->regist(matcher); - } - void Touch() {} -}; - -} // namespace framework -} // namespace paddle_mobile - -#define REGISTER_FUSION_MATCHER(op_type, matcher) \ - static paddle_mobile::framework::FusionOpRegistrar \ - __fusion_matcher_registrar_##op_type(new matcher()); \ - int 
TouchFusionMatcherRegistrar_##op_type() { \ - __fusion_matcher_registrar_##op_type.Touch(); \ - return 0; \ - } diff --git a/mobile/src/framework/program/program-optimize/node.cpp b/mobile/src/framework/program/program-optimize/node.cpp deleted file mode 100644 index 68bd89b768..0000000000 --- a/mobile/src/framework/program/program-optimize/node.cpp +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/program/program-optimize/node.h" -#include -#include -#include -#include "framework/operator.h" - -namespace paddle_mobile { - -namespace framework { - -std::vector Node::operator[](int index) { - std::vector nodes; - GetNodesWithLocation(index, 0, &nodes); - return nodes; -} - -void Node::GetNodesWithLocation(int index, int now_index, - std::vector *nodes) { - if (index == now_index) { - nodes->push_back(this); - } - - for (int i = 0; i < this->outputs_.size(); ++i) { - this->outputs_[i]->GetNodesWithLocation(index, now_index + 1, nodes); - } -} - -Node &Node::operator>(std::shared_ptr node) { - outputs_.push_back(node); - node->inputs_.push_back(this); - return *node; -} - -bool Node::operator==(const Node &in) { - if (in.type_ == this->type_) { - if (this->outputs_.size() == in.outputs_.size()) { - for (int i = 0; i < outputs_.size(); ++i) { - if (!(this->outputs_[i]->MedianEqual(*in.outputs_[i]))) { - return false; - } - } - } else { - return false; - } - } else { - 
return false; - } - return true; -} - -bool Node::MedianEqual(const Node &in) { - if (in.type_ == this->type_) { - if (this->outputs_.size() == in.outputs_.size()) { - // if (this->inputs_.size() != in.inputs_.size()) { - // DLOG << " == - this input size: " << this->inputs_.size(); - // DLOG << " == - ptr of this " << this; - // DLOG << " == - in input size: " << in.inputs_.size(); - // DLOG << " == - input size not equal "; - // return false; - // } else { - // for (int i = 0; i < this->inputs_.size(); ++i) { - // if (this->inputs_[i]->type_ != in.inputs_[i]->type_) { - // DLOG << " == - input type not equal "; - // return false; - // } - // } - // } - - for (int i = 0; i < outputs_.size(); ++i) { - if (!((*outputs_[i]).MedianEqual(*in.outputs_[i]))) { - return false; - } - } - } else { - // DLOG << " == - output size not equal "; - return false; - } - } else { - // DLOG << " == - median type is not equal "; - return false; - } - return true; -} - -std::map Node::Relationship() { - std::map map; - RelationshipPrivate(&map); - return map; -} - -void Node::RelationshipPrivate(std::map *map) { - for (auto output : op_desc_->outputs_) { - for (auto output_key : output.second) { - (*map)[output_key] = this; - } - } - for (auto output : this->outputs_) { - output->RelationshipPrivate(map); - } -} - -std::shared_ptr Node::To(int size) { - std::shared_ptr node = std::make_shared(); - this->To(size - 1, node); - return node; -} - -void Node::To(int index, std::shared_ptr node) { - node->op_desc_ = this->op_desc_; - node->type_ = this->type_; - node->inputs_ = this->inputs_; - if (index != 0) { - } else { - return; - } - - for (int j = 0; j < this->outputs_.size(); ++j) { - std::shared_ptr sub_node = std::make_shared(); - node->outputs_.push_back(sub_node); - outputs_[j]->To(index - 1, sub_node); - } -} - -int Node::Depth(int begin) { - int depth = 0; - begin++; - for (int i = 0; i < outputs_.size(); ++i) { - int output_depth = outputs_[i]->Depth(begin); - depth = 
output_depth > depth ? output_depth : depth; - } - return begin > depth ? begin : depth; -} - -Node &Node::Folder( - int size, std::string type, - std::map>> - change, - std::vector> *removed_nodes) { - std::shared_ptr op_desc = - std::make_shared(); - op_desc->inputs_ = this->op_desc_->inputs_; - std::vector> outputs; - this->Folder(op_desc, &outputs, size - 1, &change, this, removed_nodes); - this->outputs_ = outputs; - this->type_ = type; - this->op_desc_ = op_desc; - this->op_desc_->type_ = type; - return *this; -} - -void Node::Folder( - std::shared_ptr op_desc, - std::vector> *outputs, int index, - std::map>> - *change, - Node *begin_node, std::vector> *removed_nodes) { - if (change->find(this->type_) != change->end()) { - auto change_pairs = (*change)[this->type_]; - for (const auto &change_pair : change_pairs) { - std::map f; - if (this->op_desc_->GetInputs().find(change_pair.first) != - this->op_desc_->GetInputs().end()) { - if (op_desc->GetInputs().find(change_pair.second) != - op_desc->GetInputs().end()) { - for (auto value : this->op_desc_->GetInputs()[change_pair.first]) { - op_desc->GetInputs()[change_pair.second].push_back(value); - } - } else { - op_desc->GetInputs()[change_pair.second] = - this->op_desc_->GetInputs()[change_pair.first]; - } - } - } - } - - for (auto &attr_pair : this->op_desc_->attrs_) { - op_desc->attrs_.emplace(attr_pair.first, attr_pair.second); - } - if (index > 0) { - --index; - - for (auto output : outputs_) { - if (change->find(this->type_) != change->end()) { - auto change_pairs = (*change)[this->type_]; - for (const auto &change_pair : change_pairs) { - std::map f; - if (this->op_desc_->GetOutputs().find(change_pair.first) != - this->op_desc_->GetOutputs().end()) { - if (op_desc->GetInputs().find(change_pair.second) != - op_desc->GetInputs().end()) { - for (auto value : - this->op_desc_->GetOutputs()[change_pair.first]) { - op_desc->GetInputs()[change_pair.second].push_back(value); - } - } else { - 
op_desc->GetInputs()[change_pair.second] = - this->op_desc_->GetOutputs()[change_pair.first]; - } - } - } - } - - removed_nodes->push_back(output); - output->Folder(op_desc, outputs, index, change, begin_node, - removed_nodes); - } - } else { - for (auto &op_output : this->op_desc_->outputs_) { - auto output_key = op_output.first; - if (change->find(this->type_) != change->end()) { - const auto change_pairs = (*change)[this->type_]; - for (const auto &target : change_pairs) { - if (target.first == output_key) { - output_key = target.second; - } - } - } - op_desc->outputs_.emplace(output_key, op_output.second); - } - - for (auto &output : this->outputs_) { - auto iter = - std::find(output->inputs_.begin(), output->inputs_.end(), this); - - if (iter != output->inputs_.end()) { - output->inputs_.erase(iter); - } - output->inputs_.push_back(begin_node); - outputs->push_back(output); - } - } -} -#ifdef PADDLE_MOBILE_DEBUG -std::string Node::ToString(std::string blank, const Node *node) const { - std::stringstream ss; - ss << type_ << "-> \n"; - - if (inputs_.size() > 1 && node != inputs_.back()) { - return ss.str(); - } else if (inputs_.size() > 1 && node == inputs_.back()) { - ss << "\n" << blank << type_ << "\n"; - } - - for (int i = 0; i < outputs_.size(); ++i) { - ss << blank << outputs_[i]->ToString(blank + " ", this) << ""; - } - return ss.str(); -} - -std::string Node::ToString() const { return this->ToString(" ", this); } - -void Node::Description() { - if (op_desc_.get()) { - DLOG << *op_desc_; - } else { - DLOG << " null "; - } -} - -Print &operator<<(Print &printer, const Node &node) { - printer << node.ToString(); - return printer; -} -#endif - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/program-optimize/node.h b/mobile/src/framework/program/program-optimize/node.h deleted file mode 100644 index 5b5ae7796f..0000000000 --- a/mobile/src/framework/program/program-optimize/node.h +++ /dev/null @@ -1,81 +0,0 
@@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "common/log.h" -#include "framework/program/op_desc.h" - -namespace paddle_mobile { -namespace framework { - -class Node { - friend class ProgramOptimize; - - public: - Node() {} - explicit Node(const std::string &type) : type_(type) {} - explicit Node(std::shared_ptr op_desc) - : op_desc_(op_desc), type_(op_desc->Type()) {} - Node &operator>(std::shared_ptr node); - bool operator==(const Node &in); - bool MedianEqual(const Node &in); - -#ifdef PADDLE_MOBILE_DEBUG - std::string ToString() const; - void Description(); -#endif - std::shared_ptr To(int size); - int Depth(int begin = 0); - Node &Folder( - int size, std::string type, - std::map>> - change, - std::vector> *removed_nodes); - std::shared_ptr OpDescOfNode() { return op_desc_; } - std::string Type() { return type_; } - - std::vector operator[](int index); - - std::map Relationship(); - - private: - void RelationshipPrivate(std::map *map); - void GetNodesWithLocation(int index, int now_index, - std::vector *nodes); - void To(int index, std::shared_ptr); - void Folder( - std::shared_ptr op_desc, - std::vector> *outputs, int index, - std::map>> - *change, - Node *begin_node, std::vector> *removed_nodes); - std::shared_ptr op_desc_; -#ifdef PADDLE_MOBILE_DEBUG - std::string ToString(std::string blank, const Node *node) const; 
-#endif - std::vector> outputs_; - std::vector inputs_; - std::string type_; -}; - -Print &operator<<(Print &printer, const Node &node); -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/program-optimize/program_optimize.cpp b/mobile/src/framework/program/program-optimize/program_optimize.cpp deleted file mode 100644 index eba27314ad..0000000000 --- a/mobile/src/framework/program/program-optimize/program_optimize.cpp +++ /dev/null @@ -1,300 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "framework/program/program-optimize/program_optimize.h" -#include -#include -#include "framework/program/program-optimize/fusion_op_register.h" - -namespace paddle_mobile { - -namespace framework { - -std::shared_ptr ProgramOptimize::FusionOptimize( - std::shared_ptr ori_des, bool add_split) { - std::shared_ptr optimize_program = - std::make_shared(*ori_des); - current_block_ = optimize_program->Blocks().size(); - - for (int i = 0; i < optimize_program->Blocks().size(); ++i) { - std::unordered_map> output_nodes; - std::unordered_map< - std::string, - std::vector< - std::pair, - std::unordered_map>>>> - type_map; - std::vector> nodes; - std::shared_ptr begin_node; - - auto block = optimize_program->Block(i); - for (int j = 0; j < block->Ops().size(); ++j) { - auto op = block->Ops()[j]; - std::shared_ptr node = std::make_shared(op); - if (j == 0) { - begin_node = node; - } - - const std::string op_type = op->Type(); - nodes.push_back(node); - type_map[op_type].push_back({node, output_nodes}); - const VariableNameMap &op_inputs = op->GetInputs(); - const VariableNameMap &op_outpus = op->GetOutputs(); - - for (const auto &input : op_inputs) { - for (const auto &input_name : input.second) { - if (output_nodes.find(input_name) != output_nodes.end()) { - auto input_node = output_nodes[input_name]; - *input_node > node; - } - } - } - - for (const auto &output : op_outpus) { - for (const auto &output_name : output.second) { - output_nodes[output_name] = node; - } - } - } - - for (auto ®isted : FusionOpRegister::Instance()->Matchers()) { - std::string fusion_type = registed->Type(); - std::shared_ptr matcher = registed; - - auto match_vector = type_map[matcher->BeginType()]; - - for (auto &match_node_pair : match_vector) { - auto match_node = match_node_pair.first; - - auto node_has = match_node_pair.second; - - auto depth = matcher->BeginNode().Depth(); - auto sub_node = match_node->To(depth); - // DLOG << " sub node: " << *sub_node; - if (*sub_node == 
matcher->BeginNode()) { - bool can_folder = true; - - auto relationship_map = sub_node->Relationship(); - - for (auto to_check : matcher->NeedCheck()) { - auto nodes = (*sub_node)[to_check.first]; - for (auto node : nodes) { - auto inputs_to_check = - node->OpDescOfNode()->Input(to_check.second); - - for (auto input_to_check : inputs_to_check) { - if (node_has.find(input_to_check) == node_has.end()) { - if (relationship_map.find(input_to_check) == - relationship_map.end()) { - can_folder = false; - } else { - } - } - } - } - } - - if (!can_folder) { - continue; - } - - std::vector> removed_nodes; - matcher->FolderNodes(match_node.get(), &removed_nodes); - for (int k = removed_nodes.size() - 1; k >= 0; --k) { - auto removed_node = removed_nodes[k]; - auto removed_ite = - std::find(nodes.begin(), nodes.end(), removed_node); - if (removed_ite != nodes.end()) { - nodes.erase(removed_ite); - } - } - } - } - } - - std::vector> op_descs; - if (add_split) { - GenerateOps(&op_descs, begin_node.get(), add_split); - } else { - for (int m = 0; m < nodes.size(); ++m) { - auto &node = nodes[m]; - op_descs.push_back(node->op_desc_); - } - } - block->ops_ = op_descs; - } - - for (int m = 0; m < new_blocks_.size(); ++m) { - std::shared_ptr new_block = new_blocks_[m]; - new_block->index_ = m + ori_des->blocks_.size(); - optimize_program->blocks_.push_back(new_block); - } - return optimize_program; -} - -void ProgramOptimize::GenerateOps( - std::vector> *op_desc, Node *input_node, - Node *current_node) { - if (current_node->inputs_.size() > 1 && - input_node != current_node->inputs_.back()) { - DLOG << " current type " << current_node->Type(); - - DLOG << " inputs size of current node > 0 "; - - for (int i = 0; i < current_node->inputs_.size(); ++i) { - DLOG << " input i: " << current_node->inputs_[i]->Type(); - } - - return; - } else if (current_node->inputs_.size() > 1 && - input_node == current_node->inputs_.back()) { - op_desc->push_back(current_node->op_desc_); - } else { - 
op_desc->push_back(current_node->op_desc_); - } - - for (int i = 0; i < current_node->outputs_.size(); ++i) { - auto &output = current_node->outputs_[i]; - GenerateOps(op_desc, current_node, output.get()); - } -} - -void ProgramOptimize::GenerateOps( - std::vector> *op_desc, Node *input_node, - Node *current_node, bool adding_thread, int thread_num, - std::shared_ptr new_block) { - if (current_node->outputs_.size() > 1) { - adding_thread = false; - } - - bool can_add_split = false; - const auto current_desc = current_node->OpDescOfNode(); - const VariableNameMap ¤t_op_inputs = current_desc->GetInputs(); - const VariableNameMap ¤t_op_outputs = current_desc->GetOutputs(); - // 如果当前节点有多个输出 并且 只有当前节点对应的 op_desc_ 输出数为 1 时支持 - if (current_node->outputs_.size() > 1 && current_op_outputs.size() == 1) { - can_add_split = true; - - // 遍历当前节点的 output 节点 - for (const auto &output : current_node->outputs_) { - // 不支持 output 有多个 output 的情况 - if (output->outputs_.size() > 1) { - DLOG << "don't support multi output of output"; - can_add_split = false; - break; - } - - //与节点关联的 OpDesc - std::shared_ptr &op_desc = output->op_desc_; - //获取这个 op 的 inputs key 和 outputs key - const VariableNameMap &op_inputs = op_desc->GetInputs(); - const VariableNameMap &op_outputs = op_desc->GetOutputs(); - - //判断现在 是否存在这个 op - //判断这个 output 和 input key 的 size 等于 1 - if (op_outputs.size() == 1 && op_inputs.size() == 1) { - auto inputs_of_output = op_inputs.begin()->second; - auto outputs_of_output = op_outputs.begin()->second; - - // 判断一下, 如果输入和输出没有同名, 是支持的 - for (int i = 0; i < inputs_of_output.size(); ++i) { - std::string input_of_output = inputs_of_output[i]; - for (int j = 0; j < outputs_of_output.size(); ++j) { - std::string output_of_output = outputs_of_output[j]; - if (input_of_output == output_of_output) { - DLOG << "output的 output 包含 input" << input_of_output; - can_add_split = false; - break; - } - } - } - } else { // 如果模型中包含没有的 op, 则不支持添加 split - DLOG << "找不到 这个 op 类型: " << 
output->op_desc_->Type(); - can_add_split = false; - } - } - } - - if (current_node->inputs_.size() > 1 && - input_node != current_node->inputs_.back()) { - return; - } else if (current_node->inputs_.size() > 1 && - input_node == current_node->inputs_.back()) { - new_block.reset(); - adding_thread = false; - op_desc->push_back(current_node->op_desc_); - } else { - if (new_block.get() && adding_thread) { - new_block->ops_.push_back(current_node->op_desc_); - } else { - op_desc->push_back(current_node->op_desc_); - } - } - if (adding_thread) { - Attribute attr; - attr.Set(thread_num); - current_node->op_desc_->attrs_["thread"] = attr; - } - - if (can_add_split) { - new_block = std::make_shared(); - new_block->multi_thread_ = true; - new_block->index_ = current_block_; - new_blocks_.push_back(new_block); - - adding_thread = true; - std::shared_ptr split_op_desc = std::make_shared(); - split_op_desc->type_ = G_OP_TYPE_SPLIT; - auto outputs = current_node->op_desc_->Output( - op_input_output_key[current_node->op_desc_->Type()].second[0]); - split_op_desc->inputs_ = { - {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; - auto &split_outputs = - split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]]; - for (const auto &output : current_node->outputs_) { - split_outputs.push_back(outputs[0]); - } - - Attribute attr; - attr.Set(current_block_); - split_op_desc->attrs_["block_id"] = attr; - - op_desc->push_back(split_op_desc); - current_block_++; - } - - for (int i = 0; i < current_node->outputs_.size(); ++i) { - auto &output = current_node->outputs_[i]; - if (can_add_split) { - GenerateOps(op_desc, current_node, output.get(), adding_thread, i, - new_block); - } else { - GenerateOps(op_desc, current_node, output.get(), adding_thread, - thread_num, new_block); - } - } -} - -void ProgramOptimize::GenerateOps( - std::vector> *op_descs, Node *begin_node, - bool can_add_split) { - if (can_add_split) { - this->GenerateOps(op_descs, begin_node, begin_node, 
false, -1, nullptr); - } else { - this->GenerateOps(op_descs, begin_node, begin_node); - } -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/program-optimize/program_optimize.h b/mobile/src/framework/program/program-optimize/program_optimize.h deleted file mode 100644 index 57b282926d..0000000000 --- a/mobile/src/framework/program/program-optimize/program_optimize.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/node.h" -#include "framework/program/program_desc.h" - -namespace paddle_mobile { - -namespace framework { -class ProgramOptimize { - public: - ProgramOptimize() {} - std::shared_ptr FusionOptimize( - std::shared_ptr ori_des, bool add_split = false); - - private: - int current_block_; - std::vector> new_blocks_; - void GenerateOps(std::vector> *op_descs, - Node *begin_node, bool can_add_split); - void GenerateOps(std::vector> *op_desc, - Node *input_node, Node *current_node); - void GenerateOps(std::vector> *op_desc, - Node *input_node, Node *current_node, bool adding_thread, - int thread_num, std::shared_ptr new_block); -}; -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/program.h b/mobile/src/framework/program/program.h deleted file mode 100644 index b6d1d96279..0000000000 --- a/mobile/src/framework/program/program.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "common/types.h" -#include "framework/program/program_desc.h" -#include "framework/scope.h" - -namespace paddle_mobile { -namespace framework { - -template -class Program { - public: - std::shared_ptr originProgram; - std::shared_ptr optimizeProgram; - std::shared_ptr scope; - std::string model_path; - std::string para_path; - bool combined = false; - bool quantification = false; - size_t combined_params_len; - uint8_t *combined_params_buf; - int quantification_fold = 1; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/program_desc.cpp b/mobile/src/framework/program/program_desc.cpp deleted file mode 100644 index a75bf01be1..0000000000 --- a/mobile/src/framework/program/program_desc.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "framework/operator.h" - -#include "framework/program/program_desc.h" -#include "framework/program/tensor_desc.h" - -namespace paddle_mobile { -namespace framework { - -ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) { - for (int i = 0; i < desc->n_blocks; ++i) { - blocks_.emplace_back(std::make_shared(desc->blocks[i])); - } - for (auto &block : blocks_) { - for (auto op : block->Ops()) { - for (const auto &attr : op->GetProtoAttr()) { - if (attr.type == PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK) { - size_t blk_idx = attr.block_idx; - op->SetBlockAttr(attr.name, this->MutableBlock(blk_idx)); - } else if (attr.type == - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS) { - size_t n_blocks_idx = attr.n_blocks_idx; - int32_t *blks_idx = attr.blocks_idx; - std::vector block_descs; - for (size_t i = 0; i < n_blocks_idx; ++i) { - block_descs.push_back(this->MutableBlock(blks_idx[i])); - } - op->SetBlocksAttr(attr.name, block_descs); - } - } - } - } -} - -void ProgramDesc::Description(std::string header) const { -#ifdef PADDLE_MOBILE_DEBUG - if (header.size()) { - LOG(kLOG_INFO) << header; - } - for (int i = 0; i < this->blocks_.size(); ++i) { - auto block = this->blocks_[i]; - for (int j = 0; j < block->Ops().size(); ++j) { - std::shared_ptr op_desc = block->Ops()[j]; - auto op_info_ptr = - OpInfoMap::Instance()->GetNullable(op_desc->Type()); - if (op_info_ptr == nullptr) { - DLOG << "Operator has not been registered :" << op_desc->Type().c_str(); - } - } - } - - for (int i = 0; i < this->blocks_.size(); ++i) { - auto block = this->blocks_[i]; - LOG(kLOG_DEBUG) << "block: " << block->ID(); - LOG(kLOG_INFO) << "block ops size: " << block->Ops().size(); - for (int j = 0; j < block->Ops().size(); ++j) { - auto op = block->Ops()[j]; - LOG(kLOG_DEBUG1) << j << "th, op: " << op->Type(); - for (auto &input : op->GetInputs()) { - LOG(kLOG_DEBUG2) << "input parameter: " << input.first; - for (auto &n : 
input.second) { - LOG(kLOG_DEBUG3) << "argument - " << n; - } - } - for (auto &output : op->GetOutputs()) { - LOG(kLOG_DEBUG2) << "output parameter: " << output.first; - for (auto &n : output.second) { - LOG(kLOG_DEBUG3) << "argument - " << n; - } - } - for (auto &attr : op->GetAttrMap()) { - if (attr.first == "op_callstack" || attr.first == "sub_block") continue; - LOG(kLOG_DEBUG2) << "attr name: " << attr.first; - LOG(kLOG_DEBUG3) << "argument - " << attr.second; - } - } - - for (const auto &var_desc : block->Vars()) { - LOG(kLOG_DEBUG1) << "var name: " << var_desc->Name(); - if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { - const TensorDesc &tensor_desc = var_desc->Tensor_desc(); - - LOG(kLOG_DEBUG2) << "in var tensor desc dims size: " - << tensor_desc.Dims().size(); - for (int l = 0; l < tensor_desc.Dims().size(); ++l) { - LOG(kLOG_DEBUG3) << "var tensor desc dim " << l - << " value: " << tensor_desc.Dims()[l]; - } - } - } - } - - for (const auto &block : this->blocks_) { - } -#endif -} - -std::shared_ptr ProgramDesc::Block(size_t idx) { - return blocks_[idx]; -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/program_desc.h b/mobile/src/framework/program/program_desc.h deleted file mode 100644 index f4551509ee..0000000000 --- a/mobile/src/framework/program/program_desc.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "common/types.h" -#include "framework/framework.pb-c.h" -#include "framework/program/block_desc.h" - -namespace paddle_mobile { -namespace framework { - -class ProgramDesc { - public: - friend class Node; - friend class ProgramOptimize; - explicit ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc); - - ProgramDesc(const ProgramDesc &program_desc) { - for (auto &block : program_desc.blocks_) { - std::shared_ptr copy_block = - std::make_shared(*block); - blocks_.push_back(copy_block); - } - } - - std::shared_ptr Block(size_t idx); - - BlockDesc *MutableBlock(size_t idx) { - if (idx == -1) { - return nullptr; - } else { - return blocks_[idx].get(); - } - } - - const std::vector> &Blocks() const { - return blocks_; - } - - void Description(std::string header = "") const; - - private: - std::vector> blocks_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/tensor_desc.h b/mobile/src/framework/program/tensor_desc.h deleted file mode 100644 index f1634c6503..0000000000 --- a/mobile/src/framework/program/tensor_desc.h +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "framework/framework.pb-c.h" - -namespace paddle_mobile { -namespace framework { - -enum VarType_Type { - VARTYPE_TYPE_BOOL = 0, - VARTYPE_TYPE_INT16 = 1, - VARTYPE_TYPE_INT32 = 2, - VARTYPE_TYPE_INT64 = 3, - VARTYPE_TYPE_FP16 = 4, - VARTYPE_TYPE_FP32 = 5, - VARTYPE_TYPE_FP64 = 6, - VARTYPE_TYPE_LOD_TENSOR = 7, - VARTYPE_TYPE_SELECTED_ROWS = 8, - VARTYPE_TYPE_FEED_MINIBATCH = 9, - VARTYPE_TYPE_FETCH_LIST = 10, - VARTYPE_TYPE_STEP_SCOPES = 11, - VARTYPE_TYPE_STEP_LOD_RANK_TABLE = 12, - VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY = 13, - VARTYPE_TYPE_STEP_PLACE_LIST = 14, - VARTYPE_TYPE_READER = 15, - VARTYPE_TYPE_CHANNEL = 16, - VARTYPE_TYPE_RAW = 17, - VARTYPE_TYPE_TUPLE = 18, - VARTYPE_TYPE_SIZE_T = 19, - VARTYPE_TYPE_UINT8 = 20, - VARTYPE_TYPE_INT8 = 21, -}; - -class TensorDesc { - public: - TensorDesc() = default; - TensorDesc(const TensorDesc &desc) { - this->dims_ = desc.dims_; - this->data_type_ = desc.data_type_; - } - - TensorDesc(PaddleMobile__Framework__Proto__VarType__TensorDesc *desc) { - for (int i = 0; i < desc->n_dims; ++i) { - int64_t d = desc->dims[i]; - dims_.emplace_back(d); - } - data_type_ = (VarType_Type)desc->data_type; - } - // return tensor dim as a vector - std::vector Dims() const { return dims_; }; - // return tensor data type - VarType_Type DataType() const { return data_type_; } - - private: - std::vector dims_; - VarType_Type data_type_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/program/var_desc.h b/mobile/src/framework/program/var_desc.h deleted file mode 100644 index ede7263a72..0000000000 --- a/mobile/src/framework/program/var_desc.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "framework/framework.pb-c.h" -#include "framework/program/tensor_desc.h" - -namespace paddle_mobile { -namespace framework { - -class VarDesc { - public: - VarDesc(const VarDesc &var_desc) { - this->data_type_ = var_desc.data_type_; - this->name_ = var_desc.name_; - this->persistable_ = var_desc.persistable_; - this->tensor_desc_ = var_desc.tensor_desc_; - this->type_ = var_desc.type_; - } - - VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) { - type_ = (VarType_Type)desc->type->type; - name_ = std::string(desc->name); - persistable_ = (bool)desc->persistable; - - switch (type_) { - case VARTYPE_TYPE_SELECTED_ROWS: - tensor_desc_ = TensorDesc(desc->type->selected_rows); - break; - case VARTYPE_TYPE_LOD_TENSOR: - tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor); - break; - case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY: - tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor); - break; - default: - break; - } - switch (type_) { - case VARTYPE_TYPE_CHANNEL: - data_type_ = (VarType_Type)desc->type->channel->data_type; - break; - default: - data_type_ = tensor_desc_.DataType(); - break; - } - } - - std::string Name() const { return name_; } - - VarType_Type Type() const { return type_; } - - bool Persistable() const { return persistable_; } - - const TensorDesc &Tensor_desc() const { return tensor_desc_; } - - private: - std::string name_; - bool persistable_; - TensorDesc tensor_desc_; - VarType_Type type_; - VarType_Type data_type_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git 
a/mobile/src/framework/scope.cpp b/mobile/src/framework/scope.cpp deleted file mode 100644 index e60148f3c6..0000000000 --- a/mobile/src/framework/scope.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/scope.h" - -#include -#include -#include -#include - -namespace paddle_mobile { -namespace framework { - -Scope &Scope::NewScope() const { - kids_.push_back(new Scope(this)); - return *kids_.back(); -} - -Variable *Scope::Var() { - auto *pvar = new Variable; - unnamed_vars_.push_back(pvar); - return pvar; -} - -Variable *Scope::Var(const std::string &name) { - auto *pvar = FindVarLocally(name); - if (pvar != nullptr) { - return pvar; - } - pvar = new Variable; - named_vars_[name] = pvar; - pvar->name_ = named_vars_.find(name)->first; - return pvar; -} - -Variable *Scope::FindVar(const std::string &name) const { - auto *pvar = FindVarLocally(name); - if (pvar != nullptr) { - return pvar; - } - return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); -} - -const Scope *Scope::FindScope(const Variable *var) const { - for (auto &name_var : named_vars_) { - if (name_var.second == var) { - return this; - } - } - return (parent_ == nullptr) ? 
nullptr : parent_->FindScope(var); -} - -void Scope::DropKids() { - for (Scope *s : kids_) { - delete s; - } - kids_.clear(); -} - -std::vector Scope::LocalVarNames() const { - std::vector known_vars; - known_vars.reserve(named_vars_.size()); - for (auto &name_var : named_vars_) { - known_vars.emplace_back(name_var.first); - } - return known_vars; -} - -void Scope::DeleteScope(Scope *scope) const { - auto it = std::find(kids_.begin(), kids_.end(), scope); - kids_.erase(it); - delete scope; -} - -void Scope::EraseVars(const std::vector &var_names) { - std::set var_set(var_names.begin(), var_names.end()); - for (auto it = named_vars_.begin(); it != named_vars_.end();) { - if (var_set.find(it->first) != var_set.end()) { - delete it->second; - it = named_vars_.erase(it); - } else { - ++it; - } - } -} - -void Scope::Rename(const std::string &origin_name, - const std::string &new_name) const { - auto origin_it = named_vars_.find(origin_name); - if (origin_it == named_vars_.end()) { - return; - } - auto new_it = named_vars_.find(new_name); - if (new_it != named_vars_.end()) { - return; - } - named_vars_[new_name] = origin_it->second; - named_vars_.erase(origin_it); -} - -Variable *Scope::FindVarLocally(const std::string &name) const { - auto it = named_vars_.find(name); - if (it != named_vars_.end()) { - return it->second; - } - return nullptr; -} - -#ifdef PADDLE_MOBILE_FPGA -Variable *Scope::Var(const std::string &name, const int id) { - return Var(name + std::to_string(id)); -} - -std::vector Scope::VarContain(const std::string substring, - int *min) { - std::vector v; - - int temp = 9999; - auto len0 = substring.length(); - for (auto pair : named_vars_) { - if (pair.first.find(substring) == 0) { - v.push_back(pair.second); - auto len1 = pair.first.length(); - int index = std::stoi(pair.first.substr(len0, len1)); - if (index < temp) { - temp = index; - } - } - } - *min = temp; - return v; -} - -void Scope::print_vars() { - DLOG << "====================start to print 
variables================="; - for (auto pair : named_vars_) { - DLOG << pair.first; - } - DLOG << "==================complete printing variables================"; -} -#endif - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/scope.h b/mobile/src/framework/scope.h deleted file mode 100644 index 47642cc3f1..0000000000 --- a/mobile/src/framework/scope.h +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#ifdef PADDLE_MOBILE_CL -#include "framework/cl/cl_scope.h" -#endif -#include "framework/variable.h" - -namespace paddle_mobile { -namespace framework { - -class Scope { - public: - Scope() = default; - - ~Scope() { - // clear named variables - for (auto &var : named_vars_) { - delete var.second; - } - named_vars_.clear(); - // clear unnamed variables - for (auto &var : unnamed_vars_) { - delete var; - } - unnamed_vars_.clear(); - DropKids(); - -#ifdef PADDLE_MOBILE_CL - delete cl_scope_; -#endif - } - - Scope &NewScope() const; - - /// Create a variable without name if it doesn't exist. - Variable *Var(); - - /// Create a variable with given name if it doesn't exist. - Variable *Var(const std::string &name); - - void EraseVars(const std::vector &var_names); - - /// Find a variable in the scope or any of its ancestors. Returns - /// nullptr if cannot find. 
- Variable *FindVar(const std::string &name) const; - - const Scope *parent() const { return parent_; } - - /// Find the scope or an ancestor scope that contains the given - /// variable. - const Scope *FindScope(const Variable *var) const; - - void DeleteScope(Scope *scope) const; - - /// Drop all kids scopes belonged to this scope. - void DropKids(); - - // enumerate all the variables current contains. - std::vector LocalVarNames() const; - - // Rename variable to a new name - void Rename(const std::string &origin_name, - const std::string &new_name) const; - - // Rename variable to a new name and return the new name - std::string Rename(const std::string &origin_name) const; - - Variable *FindVarLocally(const std::string &name) const; - -#ifdef PADDLE_MOBILE_FPGA - Variable *Var(const std::string &name, const int id); - std::vector VarContain(const std::string substring, int *min); - void print_vars(); -#endif - -#ifdef PADDLE_MOBILE_CL - CLScope *GetCLScpoe() { return cl_scope_; } -#endif - - private: - // Call Scope::NewScope for a sub-scope. - explicit Scope(Scope const *parent) : parent_(parent) {} - - mutable std::unordered_map named_vars_; - mutable std::vector unnamed_vars_; - mutable std::list kids_; - Scope const *parent_{nullptr}; - -#ifdef PADDLE_MOBILE_CL - CLScope *cl_scope_ = new CLScope(); -#endif -}; -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/selected_rows.cpp b/mobile/src/framework/selected_rows.cpp deleted file mode 100644 index 96e72051e5..0000000000 --- a/mobile/src/framework/selected_rows.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/selected_rows.h" - -namespace paddle_mobile { -namespace framework { - -struct ReAllocateVisitor { - ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims) - : tensor_(tensor), dims_(dims) {} - - template - void operator()() const { - framework::Tensor cpu_tensor; - T* ptr = cpu_tensor.mutable_data(dims_); - const T* old_ptr = - tensor_->memory_size() == 0 ? nullptr : tensor_->data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); - } - tensor_->ShareDataWith(cpu_tensor); - } - - framework::Tensor* tensor_; - framework::DDim dims_; -}; -// TensorCopyVisitor(value, i * value_width, *value_.get(), -// index * value_width, value_width)); -struct TensorCopyVisitor { - TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset, - const framework::Tensor src, int64_t src_offset, - int64_t size) - : dst_(dst), - dst_offset_(dst_offset), - src_(src), - src_offset_(src_offset), - size_(size) {} - - template - void operator()() const { - // TODO(Yancey1989): support other place - memory::Copy(dst_->mutable_data() + dst_offset_, - src_.data() + src_offset_, size_ * sizeof(T)); - } - - framework::Tensor* dst_; - int64_t dst_offset_; - framework::Tensor src_; - int64_t src_offset_; - int64_t size_; -}; - -bool SelectedRows::HasKey(int64_t key) const { - return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? 
false - : true; -} - -// std::vector SelectedRows::Get(std::vector keys, -// framework::Tensor* value) const { -// PADDLE_MOBILE_ENFORCE(value->IsInitialized(), -// "The value tensor should be initialized."); -// std::vector non_keys; -// int64_t value_width = value_->numel() / value_->dims()[0]; -// PADDLE_MOBILE_ENFORCE(value_width == value->numel() / value->dims()[0], -// "output tensor should have the same shape with table " -// "execpt the dims[0]."); -// -// for (size_t i = 0; i < keys.size(); ++i) { -// int64_t index = Index(keys[i]); -// if (index == -1) { -// non_keys.push_back(keys[i]); -// } else { -// framework::VisitDataType( -// framework::ToDataType(value_->type()), -// TensorCopyVisitor(value, i * value_width, *value_.get(), -// index * value_width, value_width)); -// } -// } -// return non_keys; -//} - -// bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { -// PADDLE_MOBILE_ENFORCE(value.IsInitialized(), "The value should be -// initialized."); if (value_->IsInitialized()) { -// PADDLE_MOBILE_ENFORCE( -// value.type() == value_->type(), -// "The type of the value should be same with the original value"); -// } -// PADDLE_MOBILE_ENFORCE(value.dims()[0] == static_cast(1), -// "The first dim of value should be 1."); -// auto index = Index(key); -// bool is_new_key = false; -// if (index == -1) { -// rows_.push_back(key); -// index = rows_.size() - 1; -// is_new_key = true; -// // whether need to resize the table -// if (static_cast(rows_.size()) > value_->dims()[0]) { -// auto dims = value_->dims(); -// dims[0] = (dims[0] + 1) << 1; -// framework::VisitDataType(framework::ToDataType(value.type()), -// ReAllocateVisitor(value_.get(), dims)); -// } -// } -// -// framework::VisitDataType( -// framework::ToDataType(value.type()), -// TensorCopyVisitor(value_.get(), -// index * value_->numel() / value_->dims()[0], value, -// static_cast(0), value.numel())); -// return is_new_key; -//} - -} // namespace framework -} // namespace 
paddle_mobile diff --git a/mobile/src/framework/selected_rows.h b/mobile/src/framework/selected_rows.h deleted file mode 100644 index db49bd9115..0000000000 --- a/mobile/src/framework/selected_rows.h +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "framework/lod_tensor.h" -#include "framework/mixed_vector.h" -#include "framework/tensor.h" -#include "memory/t_malloc.h" - -namespace paddle_mobile { -namespace framework { - -class SelectedRows { - /* - * @brief We can use the SelectedRows structure to reproduce a sparse table. - * A sparse table is a key-value structure that the key is an `int64_t` - * number, - * and the value is a Tensor which the first dimension is 0. - * You can use the following interface to operate the sparse table, and you - * can find - * some detail information from the comments of each interface: - * - * HasKey(key), whether the sparse table has the specified key. - * Set(key, value), set a key-value pair into the sparse table. - * Get(keys, value*), get value by given key list and apply it to the given - * value pointer - * with the specified offset. 
- * - */ - public: - SelectedRows(const std::vector& rows, const int64_t& height) - : rows_(rows), height_(height) { - value_.reset(new Tensor()); - } - - SelectedRows() { - height_ = 0; - value_.reset(new Tensor()); - } - - // platform::Place place() const { return value_->place(); } - - const Tensor& value() const { return *value_; } - - Tensor* mutable_value() { return value_.get(); } - - int64_t height() const { return height_; } - - void set_height(int64_t height) { height_ = height; } - - const Vector& rows() const { return rows_; } - - Vector* mutable_rows() { return &rows_; } - - void set_rows(const Vector& rows) { rows_ = rows; } - - /* - * @brief wheter has the specified key in the table. - * - * @return true if the key is exists. - */ - bool HasKey(int64_t key) const; - - /* - * @brief Get value by the key list, if the - * - * @return a list of keys which does not exists in table - */ - std::vector Get(std::vector keys, - framework::Tensor* tensor) const; - - /* - * @brief Set a key-value pair into the table. - * This function will double the value memory if it's not engouth. - * - * @note: - * 1. The first dim of the value should be 1 - * 2. The value should be initialized and the data type - * should be the same with the table. - * - * @return true if the key is a new one, otherwise false - * - */ - bool Set(int64_t key, const Tensor& value); - - /* - * @brief Get the index of key in rows - * - * @return -1 if the key does not exists. - */ - int64_t Index(int64_t key) const { - auto it = std::find(rows_.begin(), rows_.end(), key); - if (it == rows_.end()) { - return static_cast(-1); - } - return static_cast(std::distance(rows_.begin(), it)); - } - - DDim GetCompleteDims() const { - std::vector dims = vectorize(value_->dims()); - dims[0] = height_; - return make_ddim(dims); - } - - private: - // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. - // SelectedRows are simply concated when adding together. 
Until a - // SelectedRows add a Tensor, will the duplicate rows be handled. - Vector rows_; - std::unique_ptr value_{nullptr}; - int64_t height_; -}; - -/* - * Serialize/Desiralize SelectedRows to std::ostream - * You can pass ofstream or ostringstream to serilize to file - * or to a in memory string. GPU tensor will be copied to CPU. - */ -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows); - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/tensor.h b/mobile/src/framework/tensor.h deleted file mode 100644 index 7cab1408da..0000000000 --- a/mobile/src/framework/tensor.h +++ /dev/null @@ -1,355 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "common/enforce.h" -#include "framework/data_layout.h" -#include "framework/tensor_base.h" -#include "memory/t_malloc.h" - -#ifdef PADDLE_MOBILE_FPGA_KD -#include "framework/zynqmp/ztensor.hpp" -#endif - -#ifndef PADDLE_MOBILE_FPGA_KD - -namespace paddle_mobile { -namespace framework { - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -class LoDTensor; - -class Tensor : public TensorBase { - public: - Tensor() {} - template - Tensor(std::vector input, DDim ddim) { - PADDLE_MOBILE_ENFORCE( - input.size() == framework::product(ddim), - "input vector'length should be equal to tensor's length"); - - auto input_ptr = mutable_data(ddim); - for (int i = 0; i < input.size(); ++i) { - input_ptr[i] = input[i]; - } - } - - template - Tensor(T *input, DDim ddim) { - // input pointer is allocated by external sources. can't calculate its - // length. PADDLE_MOBILE_ENFORCE( - // (sizeof(input) / sizeof(input[0])) == framework::product(ddim), - // "input vector'length should be equal to tensor's length"); - - Resize(ddim); - auto type = type_id().hash_code(); - int64_t size = numel() * SizeOfType(type); - holder_.reset( - new PlaceholderImpl(size, type, reinterpret_cast(input))); - holder_->set_type(type); - offset_ = 0; - } - - Tensor(const Tensor &inTensor) { - this->dims_ = inTensor.dims_; - this->holder_ = inTensor.holder_; - this->offset_ = inTensor.offset_; - } - - /*! Resize the dimensions of the memory block. */ - inline Tensor &Resize(const DDim &dims) { - dims_ = dims; - return *this; - } - - /*! The internal of two tensors share the same memory block. */ - inline Tensor &ShareDataWith(const Tensor &src) { - src.check_memory_size(); - if (holder_.get() != src.holder_.get() || dims_ != src.dims()) { - *this = src; - } - return *this; - } - - /*! The internal of two tensors share the same memory block. 
*/ - inline Tensor &ShareHolderWith(const Tensor &src) { - src.check_memory_size(); - if (holder_.get() != src.holder_.get()) { - holder_ = src.holder_; - } - return *this; - } - - template - inline T *mutable_data_new() { - static_assert(std::is_pod::value, "T must be POD"); - const kTypeId_t type = type_id().hash_code(); - - if (holder_ != nullptr) { - holder_->set_type(type); - } - - PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") - int64_t size = numel() * SizeOfType(type); - if (holder_ == nullptr || holder_->size() != size + offset_) { - if (holder_ == nullptr) { - holder_.reset(new PlaceholderImpl(size, type)); - } else { - holder_->realloc(size); - } - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - inline void *mutable_data(const kTypeId_t type) { - if (holder_ != nullptr) { - holder_->set_type(type); - } - PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") - int64_t size = numel() * SizeOfType(type); - if (holder_ == nullptr || holder_->size() < size + offset_) { - if (holder_ == nullptr) { - holder_.reset(new PlaceholderImpl(size, type)); - } else { - holder_->resize(size); - } - offset_ = 0; - } - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T *mutable_data() { - static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(type_id().hash_code())); - } - - /** - * @brief Return a pointer to mutable memory block. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * - * @note If not exist, then allocation. - */ - template - inline T *mutable_data(DDim dims) { - static_assert(std::is_pod::value, "T must be POD"); - Resize(dims); - return mutable_data(); - } - - /** - * @brief Return a sub-tensor of the given tensor. 
- * - * @param[in] begin_idx The index of the start row(inclusive) to - * slice. - * The index number begins from 0. - * @param[in] end_idx The index of the end row(exclusive) to - * slice. - * The index number begins from 0. - */ - inline Tensor Slice(int begin_idx, int end_idx) const { - check_memory_size(); - PADDLE_MOBILE_ENFORCE(begin_idx >= 0, - "The start row index must be greater than 0.") - PADDLE_MOBILE_ENFORCE(end_idx <= dims_[0], - "The end row index is out of bound.") - PADDLE_MOBILE_ENFORCE( - begin_idx < end_idx, - "The start row index must be lesser than the end row index") - if (dims_[0] == 1) { - return *this; - } else { - size_t base = numel() / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); - return dst; - } - } - - /*! Return a pointer to mutable memory block. */ - template - inline T *data() { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type() == type_id().hash_code()), - "Tensor holds the wrong type, it holds %d, requested %d", - this->holder_->type(), type_id().hash_code()); - - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - /*! Return a pointer to constant memory block. 
*/ - template - inline const T *data() const { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type() == type_id().hash_code()), - "Tensor holds the wrong type, it holds %d, requested %d", - this->holder_->type(), type_id().hash_code()); - - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } - - private: - struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(size_t size, const kTypeId_t type) - : ptr_(static_cast(memory::Alloc(size)), - [](uint8_t *ptr) { memory::PODDeleter()(ptr); }), - size_(size), - capatity_(size), - type_(type) { - PADDLE_MOBILE_ENFORCE(ptr_ != nullptr, - "Insufficient memory to allocation"); - } - - PlaceholderImpl(size_t size, const kTypeId_t type, uint8_t *ptr) - : ptr_(ptr, [](uint8_t *ptr) {}), - size_(size), - capatity_(size), - type_(type) { - PADDLE_MOBILE_ENFORCE(ptr_ != nullptr, - "Insufficient memory to allocation"); - } - - virtual size_t size() const { return size_; } - - virtual void *ptr() const { return static_cast(ptr_.get()); } - - virtual kTypeId_t type() const { return type_; } - - virtual void set_type(const kTypeId_t type) { type_ = type; } - - virtual void resize(size_t size) { - if (size > capatity_) { - capatity_ = size; - ptr_.reset(static_cast(memory::Alloc(capatity_))); - } - size_ = size; - } - - virtual void realloc(size_t size) { - capatity_ = size; - ptr_.reset(static_cast(memory::Alloc(capatity_))); - size_ = size; - } - - std::unique_ptr> ptr_; - - /*! the size of memory block. 
*/ - size_t size_; - - size_t capatity_; - - /* the current type of memory */ - kTypeId_t type_; - }; - -#ifdef PADDLE_MOBILE_FPGA - public: // NOLINT - inline void reset_data_ptr(void *p) { - ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT - } - inline void set_type(const kTypeId_t type) { holder_->set_type(type); } - inline void *get_data() { - return ( - void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get()); // NOLINT - } - - inline void *init(const kTypeId_t type) { - if (holder_ != nullptr) { - holder_->set_type(type); - } - PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") - int64_t size = 1 * SizeOfType(type); - if (holder_ == nullptr || holder_->size() < size + offset_) { - holder_.reset(new PlaceholderImpl(size, type)); - offset_ = 0; - } - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } - - float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX - void *external_data = nullptr; // only used for Feed - LayoutType layout = LAYOUT_HWC; - int64_t fpga_data_num; -#endif -}; - -#ifdef PADDLE_MOBILE_DEBUG -inline Print &operator<<(Print &printer, const Tensor &tensor) { - printer << " dims: " << tensor.dims() << "\n"; - int stride = tensor.numel() / 20; - stride = stride > 0 ? 
stride : 1; -#ifndef PADDLE_MOBILE_FPGA - for (int i = 0; i < tensor.numel(); i += stride) { - if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } else if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } else if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } else if (tensor.type() == type_id()) { - printer << static_cast(tensor.data()[i]) << " "; - } else if (tensor.type() == type_id()) { - printer << tensor.data()[i] << " "; - } - } -#endif - return printer; -} - -#endif - -inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) { - Tensor res; - res.ShareDataWith(src); - res.Resize(flatten_to_2d(src.dims(), num_col_dims)); - return res; -} - -} // namespace framework -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/framework/tensor_base.h b/mobile/src/framework/tensor_base.h deleted file mode 100644 index 97135bda39..0000000000 --- a/mobile/src/framework/tensor_base.h +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "common/enforce.h" -#include "common/type_define.h" -#include "common/types.h" -#include "framework/ddim.h" - -namespace paddle_mobile { -namespace framework { - -template -struct SizeOfTypeFunctor; - -template -struct SizeOfTypeFunctor { - size_t operator()(const kTypeId_t type) const { - if (type_id().hash_code() == type) { - return sizeof(T); - } else { - return 0UL; - } - } -}; - -template <> -struct SizeOfTypeFunctor<> { - size_t operator()(const kTypeId_t type) const { return 0UL; } -}; - -template -struct SizeOfTypeFunctor { - size_t operator()(const kTypeId_t type) const { - SizeOfTypeFunctor head; - size_t head_size = head(type); - if (head_size != 0) { - return head_size; - } - SizeOfTypeFunctor tail; - return tail(type); - } -}; - -static inline size_t SizeOfType(const kTypeId_t type) { - SizeOfTypeFunctor - functor; - size_t size = functor(type); - - PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %d", type); - return size; -} - -class TensorBase { - public: - virtual inline TensorBase &Resize(const DDim &dims) = 0; - - inline bool IsInitialized() const { return holder_ != nullptr; } - - /*! Return the dimensions of the memory block. */ - inline const DDim &dims() const { return dims_; } - - /*! Return the numel of the memory block. */ - inline int64_t numel() const { return product(dims_); } - - kTypeId_t type() const { - PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor not initialized yet when Tensor::type() is called.") - return holder_->type(); - } - - // memory size returns the holding memory size in byte. - size_t memory_size() const { - return holder_ == nullptr ? 0UL : holder_->size() - offset_; - } - - inline void check_memory_size() const { -#ifdef PADDLE_MOBILE_FPGA - return; -#endif - PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor holds no memory. 
Call Tensor::mutable_data first."); - PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), - "Tensor's dims_ is out of bound. "); - } - - protected: - /** - * @note Placeholder hides type T, so it doesn't appear as a - * template - * parameter of Variable. - */ - struct Placeholder { - virtual ~Placeholder() = default; - - virtual void *ptr() const = 0; - - virtual size_t size() const = 0; - - virtual kTypeId_t type() const = 0; - - virtual void set_type(kTypeId_t type) = 0; - - virtual void resize(size_t size) = 0; - - virtual void realloc(size_t size) = 0; - }; - - /** - * @brief points to elements dimensions. - * - * @note dims_ do not indicate the memory block size. - */ - - DDim dims_; - - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - - /** - * @brief A PlaceHolder may be shared by more than one tensor. - * - * @note Some of them may be slices of the others. So the offset_ - * is introduced here to indicate the byte offset between - * PlaceHolder::ptr_ and where the tensor data really - * begins. - */ - size_t offset_ = 0; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/tensor_util.cpp b/mobile/src/framework/tensor_util.cpp deleted file mode 100644 index 6722ec3e37..0000000000 --- a/mobile/src/framework/tensor_util.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "tensor_util.h" - -namespace paddle_mobile { -namespace framework { - -void TensorCopy(const Tensor &src, Tensor *dst) { - src.check_memory_size(); - dst->Resize(src.dims()); - auto src_ptr = src.data(); - auto dst_ptr = dst->mutable_data(src.type()); - auto size = src.numel() * SizeOfType(src.type()); - memory::Copy(dst_ptr, src_ptr, size); -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/tensor_util.h b/mobile/src/framework/tensor_util.h deleted file mode 100644 index 31fc5148c7..0000000000 --- a/mobile/src/framework/tensor_util.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "framework/tensor.h" -#include "memory/t_malloc.h" - -namespace paddle_mobile { -namespace framework { - -void TensorCopy(const Tensor& src, Tensor* dst); - -template -void TensorFromVector(const std::vector& src, Tensor* dst); - -template -void TensorFromVector(const std::vector& src, Tensor* dst) { - auto src_ptr = static_cast(src.data()); - dst->Resize({static_cast(src.size())}); - auto dst_ptr = static_cast(dst->mutable_data()); - auto size = src.size() * sizeof(T); - - memory::Copy(dst_ptr, src_ptr, size); -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/type_trait.h b/mobile/src/framework/type_trait.h deleted file mode 100644 index d1a8e30522..0000000000 --- a/mobile/src/framework/type_trait.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -namespace paddle_mobile { -namespace framework { - -template -struct DtypeTensorTrait { - // This is the type we obtained in variable. - typedef framework::LoDTensor gtype; - // This type will be the parent class type - // or the same type. - typedef framework::Tensor rtype; -}; - -#ifdef PADDLE_MOBILE_CL -template <> -struct DtypeTensorTrait { - // This is the type we obtained in variable. - typedef framework::CLImage gtype; - // This type will be the parent class type - // or the same type. 
- typedef framework::CLImage rtype; -}; -#endif - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/variable.h b/mobile/src/framework/variable.h deleted file mode 100644 index 30486cb347..0000000000 --- a/mobile/src/framework/variable.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "common/variant.h" - -namespace paddle_mobile { -namespace framework { - -class Variable { - public: - template - const T *Get() const { - return static_cast(holder_->Ptr()); - } - - template - const T GetValue() const { - if (type_id().hash_code() == type_id().hash_code()) { - PADDLE_MOBILE_THROW_EXCEPTION( - "Please use getString to get an string (to avoid of an issue with " - "gcc " - "stl lib with string copy)"); - exit(0); - } - return variant.Get(); - } - - template - void SetValue(T value) { - variant.Set(value); - } - - bool IsInitialized() const { return holder_ != nullptr; } - - template - T *GetMutable() { - if (!IsType()) { - holder_.reset(new PlaceholderImp(new T())); - } - return static_cast(holder_->Ptr()); - } - - template - bool IsType() const { - return holder_ != nullptr && holder_->Type() == type_id().hash_code(); - } - - void Clear() { holder_.reset(); } - - kTypeId_t Type() const { return holder_->Type(); } - - private: - struct Placeholder { - Placeholder() = default; - virtual ~Placeholder() = 
default; - - virtual kTypeId_t Type() const = 0; - virtual void *Ptr() const = 0; - }; - - template - struct PlaceholderImp : public Placeholder { - explicit PlaceholderImp(T *ptr) - : ptr_(ptr), type_(type_id().hash_code()) {} - - kTypeId_t Type() const override { return type_; } - void *Ptr() const override { return static_cast(ptr_.get()); } - - std::unique_ptr ptr_; - kTypeId_t type_; - }; - - friend class Scope; - - Variant variant; - std::unique_ptr holder_; - std::string name_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/mobile/src/framework/zynqmp/ztensor.hpp b/mobile/src/framework/zynqmp/ztensor.hpp deleted file mode 100644 index d68e43b6dc..0000000000 --- a/mobile/src/framework/zynqmp/ztensor.hpp +++ /dev/null @@ -1,312 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "common/enforce.h" -#include "framework/data_layout.h" -#include "framework/tensor_base.h" -#include "memory/t_malloc.h" - -#ifdef PADDLE_MOBILE_FPGA_KD - -#include "fpga/KD/tensor.hpp" - -namespace paddle_mobile { -namespace framework { - -class LoDTensor; - -class Tensor : public TensorBase { - public: - Tensor() {} - template - Tensor(std::vector input, DDim ddim) { - PADDLE_MOBILE_ENFORCE( - input.size() == framework::product(ddim), - "input vector'length should be equal to tensor's length"); - - auto input_ptr = mutable_data(ddim); - for (int i = 0; i < input.size(); ++i) { - input_ptr[i] = input[i]; - } - } - - Tensor(const Tensor &inTensor) { - this->dims_ = inTensor.dims_; - this->holder_ = inTensor.holder_; - this->offset_ = inTensor.offset_; - } - - /*! Resize the dimensions of the memory block. */ - inline Tensor &Resize(const DDim &dims) { - dims_ = dims; - // TODO(chonwhite) resize holder? - return *this; - } - - /*! The internal of two tensors share the same memory block. */ - inline Tensor &ShareDataWith(const Tensor &src) { - src.check_memory_size(); - if (holder_.get() != src.holder_.get()) { - *this = src; - } - return *this; - } - - /*! The internal of two tensors share the same memory block. 
*/ - inline Tensor &ShareHolderWith(const Tensor &src) { - src.check_memory_size(); - if (holder_.get() != src.holder_.get()) { - holder_ = src.holder_; - } - return *this; - } - - inline zynqmp::Tensor *zynqmpTensor() const { - PlaceholderImpl *holder = static_cast(holder_.get()); - // mutable_data(holder->type()); - return holder->tensor_; - } - - inline void *mutable_data(const kTypeId_t type) { - if (holder_ != nullptr) { - holder_->set_type(type); - } - PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") - int64_t size = numel() * SizeOfType(type); - if (holder_ == nullptr || holder_->size() < size + offset_) { - PlaceholderImpl *impl = nullptr; - if (holder_ == nullptr) { - std::cout << "holder null" << std::endl; - impl = new PlaceholderImpl(dims_, type); - holder_.reset(impl); - } else { - impl = static_cast(holder_.get()); - std::cout << "holder reize" << std::endl; - // holder_->resize(size); - } - impl->resize(dims_, type); - offset_ = 0; - } - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T *mutable_data() { - static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(type_id().hash_code())); - } - - /** - * @brief Return a pointer to mutable memory block. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * - * @note If not exist, then allocation. - */ - template - inline T *mutable_data(DDim dims) { - static_assert(std::is_pod::value, "T must be POD"); - Resize(dims); - return mutable_data(); - } - - /** - * @brief Return a sub-tensor of the given tensor. - * - * @param[in] begin_idx The index of the start row(inclusive) to - * slice. - * The index number begins from 0. - * @param[in] end_idx The index of the end row(exclusive) to - * slice. - * The index number begins from 0. 
- */ - inline Tensor Slice(int begin_idx, int end_idx) const { - check_memory_size(); - PADDLE_MOBILE_ENFORCE(begin_idx >= 0, - "The start row index must be greater than 0.") - PADDLE_MOBILE_ENFORCE(end_idx <= dims_[0], - "The end row index is out of bound.") - PADDLE_MOBILE_ENFORCE( - begin_idx < end_idx, - "The start row index must be lesser than the end row index") - if (dims_[0] == 1) { - return *this; - } else { - size_t base = numel() / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); - return dst; - } - } - - /*! Return a pointer to mutable memory block. */ - template - inline T *data() { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type() == type_id().hash_code()), - "Tensor holds the wrong type, it holds %d, requested %d", - this->holder_->type(), type_id().hash_code()); - - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - /*! Return a pointer to constant memory block. 
*/ - template - inline const T *data() const { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type() == type_id().hash_code()), - "Tensor holds the wrong type, it holds %d, requested %d", - this->holder_->type(), type_id().hash_code()); - - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } - - private: - struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(DDim ddim, const kTypeId_t type) { - tensor_ = new zynqmp::Tensor(); - type_ = type; - std::vector v = framework::vectorize2int(ddim); - - zynqmp::LayoutType layout_type = zynqmp::NCHW; - switch (v.size()) { - case 1: - layout_type = zynqmp::N; - break; - case 2: - layout_type = zynqmp::NC; - break; - case 3: - layout_type = zynqmp::NHW; - break; - case 4: - layout_type = zynqmp::NCHW; - break; - } - zynqmp::Shape input_shape(layout_type, v); - - // for (int i = 0; i < v.size(); i++) { - // std::cout << ":" << v[i] << std::endl; - // } - zynqmp::DataType dtype = type == _float ? 
zynqmp::FP32 : zynqmp::FP16; - tensor_->mutableData(dtype, input_shape); - } - - virtual size_t size() const { return size_; } - - virtual void *ptr() const { - void *ptr = tensor_->data(); - return ptr; - } - - virtual kTypeId_t type() const { return type_; } - - virtual void set_type(const kTypeId_t type) { type_ = type; } - - virtual void resize(size_t size) { - if (size > capatity_) { - capatity_ = size; - // TODO(chonwhite) implement; - } - size_ = size; - } - - virtual void realloc(size_t size) { - capatity_ = size; - // TODO(chonwhite) implement; - size_ = size; - } - - void resize(DDim ddim, const kTypeId_t type) { - std::vector v = framework::vectorize2int(ddim); - - zynqmp::LayoutType layout_type = zynqmp::NCHW; - switch (v.size()) { - case 1: - layout_type = zynqmp::N; - break; - case 2: - layout_type = zynqmp::NC; - break; - case 3: - layout_type = zynqmp::NHW; - break; - case 4: - layout_type = zynqmp::NCHW; - break; - } - zynqmp::Shape input_shape(layout_type, v); - zynqmp::DataType dtype = type == _float ? zynqmp::FP32 : zynqmp::FP16; - tensor_->mutableData(dtype, input_shape); - } - - /*! the size of memory block. */ - size_t size_; - - size_t capatity_; - - /* the current type of memory */ - kTypeId_t type_; - - zynqmp::Tensor *tensor_; - // zynqmp::Shape* shape_; - }; -}; - -#ifdef PADDLE_MOBILE_DEBUG -inline Print &operator<<(Print &printer, const Tensor &tensor) { - printer << " dims: " << tensor.dims() << "\n"; - int stride = tensor.numel() / 20; - stride = stride > 0 ? 
stride : 1; - return printer; -} - -#endif - -inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) { - Tensor res; - res.ShareDataWith(src); - res.Resize(flatten_to_2d(src.dims(), num_col_dims)); - return res; -} - -} // namespace framework -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/io/api.cc b/mobile/src/io/api.cc deleted file mode 100644 index b9e7421b54..0000000000 --- a/mobile/src/io/api.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "common/type_define.h" -#include "cstring" -#include "io/paddle_inference_api.h" - -namespace paddle_mobile { - -int PaddleDtypeSize(PaddleDType dtype) { - switch (dtype) { - case PaddleDType::FLOAT32: - return sizeof(float); - case PaddleDType::INT64: - return sizeof(int64_t); - default: - assert(false); - return -1; - } -} - -PaddleBuf::PaddleBuf(PaddleBuf&& other) - : data_(other.data_), - length_(other.length_), - memory_owned_(other.memory_owned_) { - other.memory_owned_ = false; - other.data_ = nullptr; - other.length_ = 0; -} - -PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } - -PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { - // only the buffer with external memory can be copied - if (!other.memory_owned_) { - data_ = other.data_; - length_ = other.length_; - memory_owned_ = other.memory_owned_; - } else { - Resize(other.length()); - memcpy(data_, other.data(), other.length()); - length_ = other.length(); - memory_owned_ = true; - } - return *this; -} - -void PaddleBuf::Resize(size_t length) { - // Only the owned memory can be reset, the external memory can't be changed. - if (length_ == length) return; - if (memory_owned_) { - Free(); - } - data_ = new char[length]; - length_ = length; - memory_owned_ = true; -} - -void PaddleBuf::Reset(void* data, size_t length) { - Free(); - memory_owned_ = false; - data_ = data; - length_ = length; -} - -void PaddleBuf::Free() { - if (memory_owned_ && data_) { - assert(length_ > 0); - delete[] static_cast(data_); - data_ = nullptr; - length_ = 0; - } -} - -} // namespace paddle_mobile diff --git a/mobile/src/io/api_paddle_mobile.cc b/mobile/src/io/api_paddle_mobile.cc deleted file mode 100644 index b01407bb37..0000000000 --- a/mobile/src/io/api_paddle_mobile.cc +++ /dev/null @@ -1,326 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "io/api_paddle_mobile.h" -#include -#include -#include -#include -#include "common/enforce.h" -#include "common/type_define.h" -#include "framework/tensor.h" -#ifdef PADDLE_MOBILE_FPGA -#include -#endif - -namespace paddle_mobile { - -template -PaddleMobilePredictor::PaddleMobilePredictor( - const PaddleMobileConfig &config) { - PADDLE_MOBILE_ENFORCE(Init(config) == true, - "paddle mobile predictor init failed!"); - config_ = config; -} - -template -bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { - PaddleMobileConfigInternal configInternal; - configInternal.load_when_predict = config.load_when_predict; - if (config.pre_post_type == PaddleMobileConfig::UINT8_255) { - configInternal.pre_post_type = PrePostType::UINT8_255; - } - - configInternal.memory_optimization_level = - config.mem_opt ? 
MemoryOptimizationWithoutFeeds : NoMemoryOptimization; - - paddle_mobile_.reset(new PaddleMobile(configInternal)); -#ifdef PADDLE_MOBILE_CL - paddle_mobile_->SetCLPath(config.cl_path); -#endif - if (config.memory_pack.from_memory) { - DLOG << "load from memory!"; - paddle_mobile_->LoadCombinedMemory( - config.memory_pack.model_size, config.memory_pack.model_buf, - config.memory_pack.combined_params_size, - config.memory_pack.combined_params_buf, config.optimize, - config.quantification, config.batch_size, config.lod_mode); - } else if (!config.model_dir.empty()) { - paddle_mobile_->Load(config.model_dir, config.optimize, - config.quantification, config.batch_size, - config.lod_mode); - } else if (!config.prog_file.empty() && !config.param_file.empty()) { - paddle_mobile_->Load(config.prog_file, config.param_file, config.optimize, - config.quantification, config.batch_size, - config.lod_mode); - } else { - LOG(kLOG_ERROR) << "fail to load inference model!"; - return false; - } - // If the openmp is open, set the thread num - paddle_mobile_->SetThreadNum(config.thread_num); - return true; -} -template -bool PaddleMobilePredictor::Run( - const std::vector &inputs, - std::vector *output_data, int batch_size) { - if (inputs.empty()) { - LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; - return false; - } - auto input = inputs[0]; - - if (input.lod.size() == 0 && input.shape.size() != 4) { - LOG(kLOG_ERROR) << "input shape not equal to 4!"; - return false; - } - std::vector dims; - for (auto d : input.shape) { - dims.push_back(static_cast(d)); - } - - // use tensor - framework::DDim ddim = framework::make_ddim(dims); - int input_length = framework::product(ddim); - if (input.lod.size() > 0) { - framework::LoDTensor input_lod_tensor; - paddle_mobile::framework::LoD lod{{}}; - for (int i = 0; i < input.lod.size(); ++i) { - lod[0].push_back(input.lod[i]); - } - input_lod_tensor.set_lod(lod); - input_lod_tensor.Resize(ddim); - if (input.dtype == 
UINT8) { - memcpy(input_lod_tensor.mutable_data(), - static_cast(input.data.data()), - input_length * sizeof(uint8_t)); - } else { - memcpy(input_lod_tensor.mutable_data(), - static_cast(input.data.data()), input_length * sizeof(T)); - } - paddle_mobile_->Predict(input_lod_tensor); - } else { - if (input.dtype == UINT8) { - framework::Tensor input_tensor(static_cast(input.data.data()), - ddim); - if (paddle_mobile_->Predict(input_tensor) != PMStatus::PMSuccess) { - return false; - } - } else { - framework::Tensor input_tensor(static_cast(input.data.data()), ddim); - if (paddle_mobile_->Predict(input_tensor) != PMStatus::PMSuccess) { - return false; - } - } - } - - auto output_tensor = paddle_mobile_->Fetch(); - - if (output_data->empty()) { - LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; - return false; - } - - auto &output = (*output_data)[0]; - int output_length = output_tensor->numel(); - std::vector tensor_shape = - framework::vectorize(output_tensor->dims()); - - for (auto d : tensor_shape) { - output.shape.push_back(static_cast(d)); - } - - if (output.dtype == UINT8) { - if (output.data.length() < output_length * sizeof(uint8_t)) { - output.data.Resize(output_length * sizeof(uint8_t)); - } - - memcpy(output.data.data(), output_tensor->template data(), - output_length * sizeof(uint8_t)); - } else { - if (output.data.length() < output_length * sizeof(T)) { - output.data.Resize(output_length * sizeof(T)); - } - - memcpy(output.data.data(), output_tensor->template data(), - output_length * sizeof(T)); - } - - return true; -} - -template -std::string PaddleMobilePredictor::GetExceptionMsg() { - return paddle_mobile_->GetExceptionMsg(); -} - -#ifdef PADDLE_MOBILE_FPGA -void ConvertPaddleTensors(const PaddleTensor &src, framework::Tensor *des) { - des->Resize(framework::make_ddim(src.shape)); - des->external_data = src.data.data(); - des->set_type(static_cast(static_cast(src.dtypeid))); - des->layout = - src.layout == LAYOUT_HWC ? 
framework::LAYOUT_HWC : framework::LAYOUT_CHW; -} - -void ConvertTensors(const framework::Tensor &src, PaddleTensor *des) { - des->shape = framework::vectorize2int(src.dims()); - des->dtypeid = static_cast(static_cast(src.type())); - des->layout = src.layout == framework::LAYOUT_HWC ? LAYOUT_HWC : LAYOUT_CHW; - - auto num = src.numel(); - if (src.type() == type_id()) { - des->data.Reset(const_cast(src.data()), - num * sizeof(float)); - } else if (src.type() == type_id()) { - des->data.Reset(const_cast(src.data()), - num * sizeof(int16_t)); - } else { - des->data.Reset(const_cast(src.data()), - num * sizeof(int8_t)); - } -} - -template -void PaddleMobilePredictor::FeedPaddleTensors( - const std::vector &inputs) { - auto num = inputs.size(); - std::vector tensors(num, framework::Tensor()); - for (int i = 0; i < num; i++) { - if (static_cast(static_cast(inputs[i].dtypeid)) == - type_id().hash_code()) { - tensors[i].init(type_id().hash_code()); - } else { - tensors[i].init(type_id().hash_code()); - } - ConvertPaddleTensors(inputs[i], &tensors[i]); - } - paddle_mobile_->FeedTensorData(tensors); -} - -template -void PaddleMobilePredictor::FetchPaddleTensors( - std::vector *outputs) { - // auto num = outputs->size(); - // PADDLE_MOBILE_ENFORCE(num > 0, "0 output pointers is not permitted"); - // std::vector tensors(num, nullptr); - outputs->clear(); - std::vector tensors; - paddle_mobile_->GetTensorResults(&tensors); - auto num = tensors.size(); - outputs->resize(num, PaddleTensor()); - for (int i = 0; i < num; i++) { - ConvertTensors(*tensors[i], &(*outputs)[i]); - } -} - -template -void PaddleMobilePredictor::FetchPaddleTensors(PaddleTensor *output, - int id) { - std::shared_ptr tensor_ptr = - paddle_mobile_->FetchResult(id); - void *data_addr = nullptr; - int data_sizeof = 1; - if (tensor_ptr.get()->type() == type_id().hash_code()) { - data_addr = tensor_ptr.get()->data(); - data_sizeof = sizeof(half); - } else if (tensor_ptr.get()->type() == type_id().hash_code()) { - 
data_addr = tensor_ptr.get()->data(); - data_sizeof = sizeof(float); - } else if (tensor_ptr.get()->type() == type_id().hash_code()) { - data_addr = tensor_ptr.get()->data(); - data_sizeof = sizeof(int8_t); - } else { - PADDLE_MOBILE_ENFORCE(0, "output typeid is not supported"); - } - size_t size = tensor_ptr.get()->numel() * data_sizeof; - fpga::fpga_invalidate(data_addr, size); - ConvertTensors(*(tensor_ptr.get()), output); - return; -} -template -void PaddleMobilePredictor::GetPaddleTensor(const std::string &name, - PaddleTensor *output) { - framework::Tensor *t = paddle_mobile_->GetTensorByName(name); - ConvertTensors(*t, output); -} - -template -void PaddleMobilePredictor::Predict_From_To(int start, int end) { - paddle_mobile_->Predict_From_To(start, end); -} - -#else -template -void PaddleMobilePredictor::Feed(const std::string &var_name, - const PaddleTensor &input) { - framework::DDim ddim = framework::make_ddim(input.shape); - framework::Tensor input_tensor(static_cast(input.data.data()), ddim); - paddle_mobile_->Feed(var_name, input_tensor); -} - -template -void PaddleMobilePredictor::Fetch(const std::string &var_name, - PaddleTensor *output) { - auto output_tensor = paddle_mobile_->Fetch(var_name); - auto ddim = output_tensor->dims(); - - output->shape.clear(); - for (int i = 0; i < ddim.size(); i++) { - output->shape.push_back(static_cast(ddim[i])); - } - - int length = output_tensor->numel() * sizeof(T); - if (output->data.length() < length) { - output->data.Resize(length); - } - memcpy(output->data.data(), output_tensor->template data(), length); -} - -template -bool PaddleMobilePredictor::Run() { - paddle_mobile_->Predict(); -} -#endif -template -PaddleMobilePredictor::~PaddleMobilePredictor() { - paddle_mobile_->Clear(); -} - -// A factory to help create difference predictor. 
-template <> -std::unique_ptr -CreatePaddlePredictor( - const PaddleMobileConfig &config) { - std::unique_ptr x; - if (config.precision == PaddleMobileConfig::FP32) { - if (config.device == PaddleMobileConfig::kCPU) { - x.reset(new PaddleMobilePredictor(config)); - } else if (config.device == PaddleMobileConfig::kFPGA) { - x.reset(new PaddleMobilePredictor(config)); - } else if (config.device == PaddleMobileConfig::kGPU_CL) { - x.reset(new PaddleMobilePredictor(config)); - } else { - LOG(kLOG_ERROR) << "unsupport device type!"; - return nullptr; - } - } else { - LOG(kLOG_ERROR) << "unsupport precision type!"; - return nullptr; - } - return std::move(x); -} - -} // namespace paddle_mobile diff --git a/mobile/src/io/api_paddle_mobile.h b/mobile/src/io/api_paddle_mobile.h deleted file mode 100644 index 6a33e2812a..0000000000 --- a/mobile/src/io/api_paddle_mobile.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "common/types.h" -#include "io/paddle_inference_api.h" -#include "io/paddle_mobile.h" - -namespace paddle_mobile { - -template -class PaddleMobilePredictor : public PaddlePredictor { - public: - PaddleMobilePredictor() = delete; - - explicit PaddleMobilePredictor(const PaddleMobileConfig& config); - - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override; - std::string GetExceptionMsg(); -#ifdef PADDLE_MOBILE_FPGA - void Predict_From_To(int start, int end) override; - void FeedPaddleTensors(const std::vector& inputs) override; - void FetchPaddleTensors(std::vector* outputs) override; - void FetchPaddleTensors(PaddleTensor* outputs, int id) override; - void GetPaddleTensor(const std::string& name, PaddleTensor* output) override; -#else - void Feed(const std::string& var_name, const PaddleTensor& input); - void Fetch(const std::string& var_name, PaddleTensor* output); - bool Run(); -#endif - - ~PaddleMobilePredictor() override; - - private: - std::unique_ptr> paddle_mobile_; - bool Init(const PaddleMobileConfig& config); - - PaddleMobileConfig config_; -}; - -} // namespace paddle_mobile diff --git a/mobile/src/io/ios_io/PaddleMobileCPU.h b/mobile/src/io/ios_io/PaddleMobileCPU.h deleted file mode 100644 index 07e10c0671..0000000000 --- a/mobile/src/io/ios_io/PaddleMobileCPU.h +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#import -#import - -@interface PaddleMobileCPUResult: NSObject - -/** - @b 输出指针 - */ -@property (assign, nonatomic, readonly) float *output; - -/** - @b 输出的 float 数 - * */ -@property (assign, nonatomic, readonly) int outputSize; - -/** - @b 维度信息, longlongValue - */ -@property (strong, nonatomic, readonly) NSArray *dim; - --(void)releaseOutput; - -@end - -@interface PaddleMobileCPUConfig: NSObject - -/** - @b 默认为 1, 多线程时, 建议设置为 2 - */ -@property (assign, nonatomic) int threadNum; - -/** - @b 是否开启运行时 infershape - */ -@property (assign, nonatomic) BOOL loddable; - -/** - @b 是否开启模型 op 融合优化 - */ -@property (assign, nonatomic) BOOL optimize; - -/** - @b 是否预测时初始化内存,用于处理可变输入 - */ -@property (assign, nonatomic) BOOL loadWhenPredict; - -@end - -@interface PaddleMobileCPU : NSObject - -/** - @b 创建对象 - - @param config 配置 - @return paddlemobile CPU 对象 - */ -- (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config; - -/** - @b 加载模型 - - @param modelPath 模型路径 - @param weighsPath 权重路径 - @return 是否加载成功 - */ -- (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; - -/** - @b 加载散开形式的模型, 需传入模型的目录 - - @param modelAndWeightPath 模型和权重的路径 - @return 是否加载成功 - */ -- (BOOL)load:(NSString *)modelAndWeightPath; - -/** - @b 从内存中加载模型 - - @param modelLen 模型大小(字节数) - @param modelBuf 模型在内存中的位置 - @param combinedParamsLen 权重大小(字节数) - @param combinedParamsBuf 权重在内存中的位置 - @return 是否加载成功 - */ -- (BOOL)LoadCombinedMemory:(size_t)modelLen - andModelBuf:(const uint8_t *)modelBuf - andModelParamsLen:(size_t)combinedParamsLen - andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf; - -/** - @b 对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值 - - @param image 输入的图像 - @param output 预处理后的输出 - @param means 预处理中 means - @param scale 预处理中的 scale - @param dim 预处理后的维度 - */ --(void)preprocess:(CGImageRef)image - output:(float 
*)output - means:(NSArray *)means - scale:(float)scale - dim:(NSArray *)dim; - -/** - 进行预测 - - @param input 输入 - @param dim 输入维度 - @return 输出结果 - */ -- (PaddleMobileCPUResult *)predictInput:(float *)input - dim:(NSArray *)dim; - -/** - @b 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值 - - @param image 输入图像 - @param dim 输入维度 - @param means 预处理中 means - @param scale 预处理中 scale - @return 预测结果 - */ -- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; - -/** - @b 进行预测, means stds和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值 - - @param image 输入图像 - @param dim 输入维度 - @param means 预处理中 means - @param stds 预处理中 stds - @param scale 预处理中 scale - @return 预测结果 - */ -- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means stds:(NSArray *)stds scale:(float)scale; - -/** - @b 进行预测, 预处理 means 值为 0, scale 值为 1 - - @param image 输入图像 - @param dim 输入维度 - @return 预测结果 - */ -- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim; - - -/** - @b 取出模型描述中 key 为 "fetch" 对应的输出 - - @return 预测结果 - */ -- (PaddleMobileCPUResult *)fetchOutput; - -/** - @b 当输出为多个时, 可用此函数取出对应的输出 - - @param key 模型中输出的key - @return 预测结果 - */ -- (PaddleMobileCPUResult *)fetchOutputWithKey:(NSString *)key; - -/** - @b 清理内存 - */ -- (void)clear; - -@end diff --git a/mobile/src/io/ios_io/PaddleMobileCPU.mm b/mobile/src/io/ios_io/PaddleMobileCPU.mm deleted file mode 100644 index b952ad8e60..0000000000 --- a/mobile/src/io/ios_io/PaddleMobileCPU.mm +++ /dev/null @@ -1,410 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#import "PaddleMobileCPU.h" -#import "framework/load_ops.h" -#import "framework/tensor.h" -#import "io/paddle_mobile.h" -#import -#import - -@interface PaddleMobileCPUResult() - --(void)toSetOutput:(float *)output; - --(void)toSetOutputSize:(int)outputSize; - -@end - -@implementation PaddleMobileCPUResult - --(void)releaseOutput { - delete [] _output; - _output = nil; - _outputSize = 0; -} - --(void)toSetOutput:(float *)output { - _output = output; -} - --(void)toSetOutputSize:(int)outputSize { - _outputSize = outputSize; -} - --(void)toSetDim:(NSArray *)dim { - _dim = dim; -} - -@end - -@implementation PaddleMobileCPUConfig - --(instancetype)init { - if (self = [super init]) { - self.threadNum = 1; - self.optimize = YES; - } - return self; -} - -@end - -@interface PaddleMobileCPU() -{ - paddle_mobile::PaddleMobile *pam_; - BOOL loaded_; -} - -@property (strong, nonatomic) PaddleMobileCPUConfig *config; - -@end - -@implementation PaddleMobileCPU - -static std::mutex shared_mutex; - -- (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config { - if (self = [super init]) { - paddle_mobile::PaddleMobileConfigInternal configInternal; - configInternal.load_when_predict = config.loadWhenPredict; - pam_ = new paddle_mobile::PaddleMobile(); - _config = config; - } - return self; -} - --(instancetype)init { - if (self = [super init]) { - _config = [[PaddleMobileCPUConfig alloc] init]; - pam_ = new paddle_mobile::PaddleMobile(); - } - return self; -} - -- (void)dealloc { - if (pam_) { - delete pam_; - pam_ = nullptr; - } -} - -+ (instancetype)sharedInstance{ - 
static dispatch_once_t onceToken; - static id sharedManager = nil; - dispatch_once(&onceToken, ^{ - sharedManager = [[[self class] alloc] init]; - }); - return sharedManager; -} - -- (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath { - std::lock_guard lock(shared_mutex); - std::string model_path_str = std::string([modelPath UTF8String]); - std::string weights_path_str = std::string([weighsPath UTF8String]); - pam_->SetThreadNum(self.config.threadNum); - if (loaded_ = pam_->Load(model_path_str, weights_path_str, self.config.optimize, false, 1, self.config.loddable)) { - return YES; - } else { - return NO; - } -} - -- (BOOL)LoadCombinedMemory:(size_t)modelLen - andModelBuf:(const uint8_t *)modelBuf - andModelParamsLen:(size_t)combinedParamsLen - andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf { - std::lock_guard lock(shared_mutex); - pam_->SetThreadNum(self.config.threadNum); - return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, - const_cast(combinedParamsBuf), self.config.optimize, false, 1, self.config.loddable); -} - -- (BOOL)load:(NSString *)modelAndWeightPath{ - std::lock_guard lock(shared_mutex); - std::string model_path_str = std::string([modelAndWeightPath UTF8String]); - if (loaded_ = pam_->Load(model_path_str, self.config.optimize, false, 1, self.config.loddable)) { - return YES; - } else { - return NO; - } -} - - --(void)preprocess:(CGImageRef)image - output:(float *)output - means:(NSArray *)means - scale:(float)scale - dim:(NSArray *)dim { - std::lock_guard lock(shared_mutex); - - if (means == nil) { - means = @[@0, @0, @0]; - } - - // dim to c++ vector, get numel - std::vector dim_vec; - int numel = 1; - for (int k = 0; k < dim.count; ++k) { - int d = dim[k].intValue; - numel *= d; - dim_vec.push_back(d); - } - - const int sourceRowBytes = CGImageGetBytesPerRow(image); - const int imageWidth = CGImageGetWidth(image); - const int imageHeight = CGImageGetHeight(image); - const int 
imageChannels = 4; - CGDataProviderRef provider = CGImageGetDataProvider(image); - CFDataRef cfData = CGDataProviderCopyData(provider); - const UInt8 *input = CFDataGetBytePtr(cfData); - - int wanted_input_width = dim_vec[3]; - int wanted_input_height = dim_vec[2]; - int wanted_input_channels = dim_vec[1]; - - for (int c = 0; c < wanted_input_channels; ++c) { - float *out_channel = output + c * wanted_input_height * wanted_input_width; - for (int y = 0; y < wanted_input_height; ++y) { - float *out_row = out_channel + y * wanted_input_width; - for (int x = 0; x < wanted_input_width; ++x) { - int in_row = (y * imageHeight) / wanted_input_height; - int in_col = (x * imageWidth) / wanted_input_width; - const UInt8 *in_pixel = input + (in_row * sourceRowBytes) + (in_col * imageChannels); - float *out_pos = out_row + x; - *out_pos = (in_pixel[2 - c] - means[c].floatValue) * scale; - } - } - } - -} - --(void)preprocess:(const UInt8 *)input output:(float *)output bytesPerRow:(int)bytesPerRow imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray *)means stds:(NSArray *)stds scale:(float)scale dim:(std::vector)dim { - if (means == nil) { - means = @[@0, @0, @0]; - } - if (stds == nil) { - stds = @[@1, @1, @1]; - } - - int wanted_input_width = dim[3]; - int wanted_input_height = dim[2]; - int wanted_input_channels = dim[1]; - - for (int c = 0; c < wanted_input_channels; ++c) { - float *out_channel = output + c * wanted_input_height * wanted_input_width; - for (int y = 0; y < wanted_input_height; ++y) { - float *out_row = out_channel + y * wanted_input_width; - for (int x = 0; x < wanted_input_width; ++x) { - int in_row = (y * imageHeight) / wanted_input_height; - int in_col = (x * imageWidth) / wanted_input_width; - const UInt8 *in_pixel = input + (in_row * bytesPerRow) + (in_col * imageChannels); - float *out_pos = out_row + x; - *out_pos = (in_pixel[2 - c] - means[c].floatValue) / stds[c].floatValue * scale; - } - } - } -} - 
-- (PaddleMobileCPUResult *)predictInput:(float *)input - dim:(NSArray *)dim { - std::lock_guard lock(shared_mutex); - if (!loaded_) { - printf("PaddleMobile doesn't be loaded yet"); - return nil; - } - - if (dim.count != 4) { - printf("dim must have 4 elements"); - return nil; - } - - // dim to c++ vector, get numel - std::vector dim_vec; - int numel = 1; - for (int k = 0; k < dim.count; ++k) { - int d = dim[k].intValue; - numel *= d; - dim_vec.push_back(d); - } - - paddle_mobile::framework::Tensor input_tensor; - paddle_mobile::framework::DDim dims = paddle_mobile::framework::make_ddim(dim_vec); - float *input_ptr = input_tensor.mutable_data(dims); - memcpy(input_ptr, input, - numel * sizeof(float)); - - pam_->Predict(input_tensor); - std::shared_ptr output = pam_->Fetch(); - - auto output_dims = output->dims(); - std::vector output_dim_vec = vectorize(output_dims); - NSMutableArray *ocDim = [NSMutableArray array]; - for (int i = 0; i < output_dim_vec.size(); ++i) { - NSNumber *num = [NSNumber numberWithLongLong:output_dim_vec[i]]; - [ocDim addObject:num]; - } - - float *output_pointer = new float[output->numel()]; - - memcpy(output_pointer, output->data(), - output->numel() * sizeof(float)); - - PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; - [cpuResult toSetOutput: output_pointer]; - [cpuResult toSetDim: ocDim]; - [cpuResult toSetOutputSize: output->numel()]; - - return cpuResult; -} - -- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means stds:(NSArray *)stds scale:(float)scale { - std::lock_guard lock(shared_mutex); - if (!loaded_) { - printf("PaddleMobile doesn't be loaded yet"); - return nil; - } - - if (dim.count != 4) { - printf("dim must have 4 elements"); - return nil; - } - - // dim to c++ vector, get numel - std::vector dim_vec; - int numel = 1; - for (int k = 0; k < dim.count; ++k) { - int d = dim[k].intValue; - numel *= d; - dim_vec.push_back(d); - } - - const int sourceRowBytes = 
CGImageGetBytesPerRow(image); - const int image_width = CGImageGetWidth(image); - const int image_height = CGImageGetHeight(image); - const int image_channels = 4; - CGDataProviderRef provider = CGImageGetDataProvider(image); - CFDataRef cfData = CGDataProviderCopyData(provider); - const UInt8 *input = CFDataGetBytePtr(cfData); - - // sample image - float *output = (float *)malloc(numel*sizeof(float)); - [self preprocess:input output:output bytesPerRow:sourceRowBytes imageWidth:image_width imageHeight:image_height imageChannels:image_channels means:means stds:stds scale:scale dim:dim_vec]; - float *dataPointer = nullptr; - if (nullptr != output) { - dataPointer = output; - } else { - return nil; - } - - paddle_mobile::framework::Tensor input_tensor; - paddle_mobile::framework::DDim dims = paddle_mobile::framework::make_ddim(dim_vec); - float *input_ptr = input_tensor.mutable_data(dims); - memcpy(input_ptr, dataPointer, - numel * sizeof(float)); - - pam_->Predict(input_tensor); - std::shared_ptr output_tensor = pam_->Fetch(); - - auto output_dims = output_tensor->dims(); - std::vector output_dim_vec = vectorize(output_dims); - NSMutableArray *ocDim = [NSMutableArray array]; - for (int i = 0; i < output_dim_vec.size(); ++i) { - NSNumber *num = [NSNumber numberWithLongLong:output_dim_vec[i]]; - [ocDim addObject:num]; - } - - float *output_pointer = new float[output_tensor->numel()]; - memcpy(output_pointer, output_tensor->data(), - output_tensor->numel() * sizeof(float)); - PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; - [cpuResult toSetOutput: output_pointer]; - [cpuResult toSetDim: ocDim]; - [cpuResult toSetOutputSize: output_tensor->numel()]; - - free(output); - CFRelease(cfData); - cfData = NULL; - - return cpuResult; -} - -- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim { - return [self predict:image dim:dim means:nil stds:nil scale:1]; -} - -- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim 
means:(NSArray *)means scale:(float)scale { - return [self predict:image dim:dim means:means stds:nil scale:scale]; -} - -- (PaddleMobileCPUResult *)fetchOutput{ - if (pam_ && loaded_) { - auto tensorPtr = pam_->Fetch(); - float *output_pointer = new float[tensorPtr->numel()]; - memcpy(output_pointer, tensorPtr->data(), - tensorPtr->numel() * sizeof(float)); - auto dims = tensorPtr->dims(); - std::vector dim_vec = vectorize(dims); - - - NSMutableArray *ocDim = [NSMutableArray array]; - for (int i = 0; i < dim_vec.size(); ++i) { - NSNumber *num = [NSNumber numberWithLongLong:dim_vec[i]]; - [ocDim addObject:num]; - } - - PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; - [cpuResult toSetOutput: output_pointer]; - [cpuResult toSetDim: ocDim]; - [cpuResult toSetOutputSize: tensorPtr->numel()]; - - return cpuResult; - } - return nil; -} - -- (PaddleMobileCPUResult *)fetchOutputWithKey:(NSString *)key{ - if (pam_ && loaded_ && key.length) { - auto tensorPtr = pam_->Fetch(std::string([key cStringUsingEncoding:NSUTF8StringEncoding])); - float *output_pointer = new float[tensorPtr->numel()]; - memcpy(output_pointer, tensorPtr->data(), - tensorPtr->numel() * sizeof(float)); - - auto dims = tensorPtr->dims(); - std::vector dim_vec = vectorize(dims); - - NSMutableArray *ocDim = [NSMutableArray array]; - for (int i = 0; i < dim_vec.size(); ++i) { - NSNumber *num = [NSNumber numberWithLongLong:dim_vec[i]]; - [ocDim addObject:num]; - } - - PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; - [cpuResult toSetOutput: output_pointer]; - [cpuResult toSetDim: ocDim]; - [cpuResult toSetOutputSize: tensorPtr->numel()]; - - return cpuResult; - } - return nil; -} - -- (void)clear{ - std::lock_guard lock(shared_mutex); - if (pam_) { - pam_->Clear(); - } -} - -@end diff --git a/mobile/src/io/jni/PML.java b/mobile/src/io/jni/PML.java deleted file mode 100644 index 3f162dcf9e..0000000000 --- a/mobile/src/io/jni/PML.java +++ /dev/null @@ -1,66 +0,0 
@@ -package com.baidu.paddle; - -public class PML { - /** - * load seperated model - * - * @param modelDir model dir - * @return isloadsuccess - */ - public static native boolean load(String modelDir, Boolean lodMode); - - /** - * load combined model - * - * @param modelPath model file path - * @param paramPath param file path - * @return isloadsuccess - */ - public static native boolean loadCombined(String modelPath, String paramPath, Boolean lodMode); - - /** - * load model and qualified params - * - * @param modelDir qualified model dir - * @return isloadsuccess - */ - public static native boolean loadQualified(String modelDir, Boolean lodMode); - - /** - * load model and qualified combined params - * - * @param modelPath model file path - * @param paramPath qualified param path - * @return isloadsuccess - */ - public static native boolean loadCombinedQualified(String modelPath, String paramPath, Boolean lodMode); - - /** - * predict image - * - * @param buf of pretreated image (as your model like) - * @param ddims format of your input - * @return result - */ - public static native float[] predictImage(float[] buf, int[] ddims); - - public static native float[] fetch(String varName); - - public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[] meanValues); - - // predict with variable length input - // support only one input and one output currently - public static native float[] predictLod(float[] buf); - - /** - * clear model data - */ - public static native void clear(); - - /** - * setThread num when u enable openmp - * - * @param threadCount threadCount - */ - public static native void setThread(int threadCount); -} diff --git a/mobile/src/io/jni/paddle_mobile_jni.cpp b/mobile/src/io/jni/paddle_mobile_jni.cpp deleted file mode 100644 index ee336889a2..0000000000 --- a/mobile/src/io/jni/paddle_mobile_jni.cpp +++ /dev/null @@ -1,465 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ANDROID - -#include "io/jni/paddle_mobile_jni.h" -#include -#include -#include -#include "common/log.h" -#include "framework/tensor.h" -#include "io/paddle_mobile.h" - -#ifdef ENABLE_EXCEPTION -#include "common/enforce.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle_mobile { -namespace jni { - -using framework::DDim; -using framework::Program; -using framework::Tensor; -using paddle_mobile::CPU; -using std::string; - -paddle_mobile::PaddleMobile paddle_mobile; -static std::mutex shared_mutex; - -PaddleMobile *getPaddleMobileInstance() { return &paddle_mobile; } - -string jstring2cppstring(JNIEnv *env, jstring jstr) { - const char *cstr = env->GetStringUTFChars(jstr, 0); - string cppstr(cstr); - env->ReleaseStringUTFChars(jstr, cstr); - return cppstr; -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, - jclass thiz, - jstring modelPath, - jboolean lodMode) { - std::lock_guard lock(shared_mutex); - ANDROIDLOGI("load invoked"); - bool optimize = true; - bool isLoadOk = false; -#ifdef ENABLE_EXCEPTION - try { - isLoadOk = getPaddleMobileInstance()->Load( - jstring2cppstring(env, modelPath), optimize, false, 1, - static_cast(lodMode)); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! 
", e.what()); - isLoadOk = false; - } -#else - isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), - optimize, false, 1, - static_cast(lodMode)); -#endif - return static_cast(isLoadOk); -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified( - JNIEnv *env, jclass thiz, jstring modelPath, jboolean lodMode) { - std::lock_guard lock(shared_mutex); - - ANDROIDLOGI("loadQualified invoked"); - bool optimize = true; - bool qualified = true; - bool isLoadOk = false; - -#ifdef ENABLE_EXCEPTION - try { - isLoadOk = getPaddleMobileInstance()->Load( - jstring2cppstring(env, modelPath), optimize, qualified, 1, - static_cast(lodMode)); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); - isLoadOk = false; - } -#else - isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), - optimize, qualified, 1, - static_cast(lodMode)); -#endif - - return static_cast(isLoadOk); -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( - JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath, - jboolean lodMode) { - std::lock_guard lock(shared_mutex); - ANDROIDLOGI("loadCombined invoked"); - bool optimize = true; - bool isLoadOk = false; - -#ifdef ENABLE_EXCEPTION - try { - isLoadOk = getPaddleMobileInstance()->Load( - jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath), - optimize, false, 1, static_cast(lodMode)); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! 
", e.what()); - isLoadOk = false; - } -#else - isLoadOk = getPaddleMobileInstance()->Load( - jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath), - optimize, false, 1, static_cast(lodMode)); -#endif - return static_cast(isLoadOk); -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified( - JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath, - jboolean lodMode) { - std::lock_guard lock(shared_mutex); - ANDROIDLOGI("loadCombinedQualified invoked"); - bool optimize = true; - bool qualified = true; - bool isLoadOk = false; - -#ifdef ENABLE_EXCEPTION - try { - isLoadOk = getPaddleMobileInstance()->Load( - jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath), - optimize, qualified, 1, static_cast(lodMode)); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); - isLoadOk = false; - } -#else - isLoadOk = getPaddleMobileInstance()->Load( - jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath), - optimize, qualified, 1, static_cast(lodMode)); -#endif - return static_cast(isLoadOk); -} - -JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( - JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) { - std::lock_guard lock(shared_mutex); - - ANDROIDLOGI("predictImage invoked"); - jfloatArray result = NULL; - -#ifdef ENABLE_EXCEPTION - ANDROIDLOGE("ENABLE_EXCEPTION!"); - - try { - jsize ddim_size = env->GetArrayLength(ddims); - if (ddim_size != 4) { - ANDROIDLOGE("ddims size not equal to 4"); - } - jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); - framework::DDim ddim = framework::make_ddim( - {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); - int length = framework::product(ddim); - int count = 0; - float *dataPointer = nullptr; - if (nullptr != buf) { - dataPointer = env->GetFloatArrayElements(buf, NULL); - } - framework::Tensor input; - input.Resize(ddim); - auto input_ptr = 
input.mutable_data(); - for (int i = 0; i < length; i++) { - input_ptr[i] = dataPointer[i]; - } - getPaddleMobileInstance()->Predict(input); - auto output = getPaddleMobileInstance()->Fetch(); - count = output->numel(); - result = env->NewFloatArray(count); - env->SetFloatArrayRegion(result, 0, count, output->data()); - env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); - env->DeleteLocalRef(ddims); - env->ReleaseFloatArrayElements(buf, dataPointer, 0); - env->DeleteLocalRef(buf); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); - } -#else - jsize ddim_size = env->GetArrayLength(ddims); - if (ddim_size != 4) { - ANDROIDLOGE("ddims size not equal to 4"); - } - jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); - framework::DDim ddim = framework::make_ddim( - {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); - int length = framework::product(ddim); - int count = 0; - float *dataPointer = nullptr; - if (nullptr != buf) { - dataPointer = env->GetFloatArrayElements(buf, NULL); - } - framework::Tensor input; - input.Resize(ddim); - auto input_ptr = input.mutable_data(); - for (int i = 0; i < length; i++) { - input_ptr[i] = dataPointer[i]; - } - getPaddleMobileInstance()->Predict(input); - auto output = getPaddleMobileInstance()->Fetch(); - count = output->numel(); - result = env->NewFloatArray(count); - env->SetFloatArrayRegion(result, 0, count, output->data()); - env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); - env->DeleteLocalRef(ddims); - env->ReleaseFloatArrayElements(buf, dataPointer, 0); - env->DeleteLocalRef(buf); -// env->DeleteLocalRef(dataPointer); -#endif - - ANDROIDLOGI("predictImage finished"); - return result; -} - -JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_fetch(JNIEnv *env, - jclass thiz, - jstring varName) { - jfloatArray result = NULL; - -#ifdef ENABLE_EXCEPTION - try { - auto output = - getPaddleMobileInstance()->Fetch(jstring2cppstring(env, varName)); - int 
count = output->numel(); - result = env->NewFloatArray(count); - env->SetFloatArrayRegion(result, 0, count, output->data()); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); - } -#else - auto output = - getPaddleMobileInstance()->Fetch(jstring2cppstring(env, varName)); - int count = output->numel(); - result = env->NewFloatArray(count); - env->SetFloatArrayRegion(result, 0, count, output->data()); -#endif - - return result; -} - -inline int yuv_to_rgb(int y, int u, int v, float *r, float *g, float *b) { - int r1 = (int)(y + 1.370705 * (v - 128)); // NOLINT - int g1 = (int)(y - 0.698001 * (u - 128) - 0.703125 * (v - 128)); // NOLINT - int b1 = (int)(y + 1.732446 * (u - 128)); // NOLINT - - r1 = (int)fminf(255, fmaxf(0, r1)); // NOLINT - g1 = (int)fminf(255, fmaxf(0, g1)); // NOLINT - b1 = (int)fminf(255, fmaxf(0, b1)); // NOLINT - *r = r1; - *g = g1; - *b = b1; - - return 0; -} -void convert_nv21_to_matrix(uint8_t *nv21, float *matrix, int width, int height, - int targetWidth, int targetHeight, float *means) { - const uint8_t *yData = nv21; - const uint8_t *vuData = nv21 + width * height; - - const int yRowStride = width; - const int vuRowStride = width; - - float scale_x = width * 1.0 / targetWidth; - float scale_y = height * 1.0 / targetHeight; - - for (int j = 0; j < targetHeight; ++j) { - int y = j * scale_y; - const uint8_t *pY = yData + y * yRowStride; - const uint8_t *pVU = vuData + (y >> 1) * vuRowStride; - for (int i = 0; i < targetWidth; ++i) { - int x = i * scale_x; - const int offset = ((x >> 1) << 1); - float r = 0; - float g = 0; - float b = 0; - yuv_to_rgb(pY[x], pVU[offset + 1], pVU[offset], &r, &g, &b); - int r_index = j * targetWidth + i; - int g_index = r_index + targetWidth * targetHeight; - int b_index = g_index + targetWidth * targetHeight; - matrix[r_index] = r - means[0]; - matrix[g_index] = g - means[1]; - matrix[b_index] = b - means[2]; - } - } -} - -JNIEXPORT jfloatArray 
JNICALL Java_com_baidu_paddle_PML_predictYuv( - JNIEnv *env, jclass thiz, jbyteArray yuv_, jint imgwidth, jint imgHeight, - jintArray ddims, jfloatArray meanValues) { - std::lock_guard lock(shared_mutex); - - ANDROIDLOGI("predictYuv invoked"); - jfloatArray result = NULL; - -#ifdef ENABLE_EXCEPTION - try { - jsize ddim_size = env->GetArrayLength(ddims); - if (ddim_size != 4) { - ANDROIDLOGE("ddims size not equal to 4"); - } - jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); - framework::DDim ddim = framework::make_ddim( - {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); - int length = framework::product(ddim); - float matrix[length]; // NOLINT - jbyte *yuv = env->GetByteArrayElements(yuv_, NULL); - float *meansPointer = nullptr; - if (nullptr != meanValues) { - meansPointer = env->GetFloatArrayElements(meanValues, NULL); - } - convert_nv21_to_matrix(reinterpret_cast(yuv), matrix, imgwidth, - imgHeight, ddim[3], ddim[2], meansPointer); - int count = 0; - framework::Tensor input; - input.Resize(ddim); - auto input_ptr = input.mutable_data(); - for (int i = 0; i < length; i++) { - input_ptr[i] = matrix[i]; - } - getPaddleMobileInstance()->Predict(input); - auto output = getPaddleMobileInstance()->Fetch(); - count = output->numel(); - result = env->NewFloatArray(count); - env->SetFloatArrayRegion(result, 0, count, output->data()); - env->ReleaseByteArrayElements(yuv_, yuv, 0); - env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); - env->ReleaseFloatArrayElements(meanValues, meansPointer, 0); - ANDROIDLOGI("predictYuv finished"); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! 
", e.what()); - } -#else - jsize ddim_size = env->GetArrayLength(ddims); - if (ddim_size != 4) { - ANDROIDLOGE("ddims size not equal to 4"); - } - jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); - framework::DDim ddim = framework::make_ddim( - {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); - int length = framework::product(ddim); - float matrix[length]; // NOLINT - jbyte *yuv = env->GetByteArrayElements(yuv_, NULL); - float *meansPointer = nullptr; - if (nullptr != meanValues) { - meansPointer = env->GetFloatArrayElements(meanValues, NULL); - } - convert_nv21_to_matrix((uint8_t *)yuv, matrix, imgwidth, // NOLINT - imgHeight, ddim[3], ddim[2], meansPointer); - int count = 0; - framework::Tensor input; - input.Resize(ddim); - auto input_ptr = input.mutable_data(); - for (int i = 0; i < length; i++) { - input_ptr[i] = matrix[i]; - } - getPaddleMobileInstance()->Predict(input); - auto output = getPaddleMobileInstance()->Fetch(); - count = output->numel(); - result = env->NewFloatArray(count); - env->SetFloatArrayRegion(result, 0, count, output->data()); - env->ReleaseByteArrayElements(yuv_, yuv, 0); - env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); - env->ReleaseFloatArrayElements(meanValues, meansPointer, 0); - ANDROIDLOGI("predictYuv finished"); -#endif - - return result; -} -JNIEXPORT jlongArray JNICALL -Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) { - std::lock_guard lock(shared_mutex); - - jlong *ddim_ptr = env->GetLongArrayElements(buf, NULL); - jsize ddim_size = env->GetArrayLength(buf); - std::vector ids; - - for (int i = 0; i < ddim_size; ++i) { - jlong x = ddim_ptr[i]; - ids.push_back((int64_t)x); - } - - paddle_mobile::framework::LoDTensor words; - - auto size = static_cast(ids.size()); - - paddle_mobile::framework::LoD lod{{0, ids.size()}}; - DDim dims{size, 1}; - words.Resize(dims); - words.set_lod(lod); - auto *pdata = words.mutable_data(); - size_t n = words.numel() * sizeof(int64_t); - memcpy(pdata, 
ids.data(), n); - paddle_mobile.Predict(words); - auto vec_result = paddle_mobile.Fetch(); - int count = vec_result->numel(); - jlongArray result = NULL; - ANDROIDLOGE("predict nlp size %d", count); - - result = env->NewLongArray(count); - env->SetLongArrayRegion(result, 0, count, vec_result->data()); - - env->ReleaseLongArrayElements(buf, ddim_ptr, 0); - return result; -} - -JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env, - jclass thiz, - jint threadCount) { - std::lock_guard lock(shared_mutex); - - ANDROIDLOGI("setThreadCount %d", threadCount); -#ifdef ENABLE_EXCEPTION - try { - getPaddleMobileInstance()->SetThreadNum(static_cast(threadCount)); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); - } -#else - getPaddleMobileInstance()->SetThreadNum(static_cast(threadCount)); -#endif -} - -JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env, - jclass thiz) { - std::lock_guard lock(shared_mutex); - -#ifdef ENABLE_EXCEPTION - try { - getPaddleMobileInstance()->Clear(); - } catch (paddle_mobile::PaddleMobileException &e) { - ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); - } -#else - getPaddleMobileInstance()->Clear(); -#endif -} - -} // namespace jni -} // namespace paddle_mobile - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/mobile/src/io/jni/paddle_mobile_jni.h b/mobile/src/io/jni/paddle_mobile_jni.h deleted file mode 100644 index 16d6768723..0000000000 --- a/mobile/src/io/jni/paddle_mobile_jni.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef ANDROID -#include - -#ifdef __cplusplus -extern "C" { -#endif -namespace paddle_mobile { -namespace jni { -/** - * load separated model for android - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, - jclass thiz, - jstring modelPath, - jboolean lodMode); - -/** - * load separated qualified model for android - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified( - JNIEnv *env, jclass thiz, jstring modelPath, jboolean lodMode); -/** - * load combined model for android - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( - JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath, - jboolean lodMode); - -/** - * load combined qualified model for android - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified( - JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath, - jboolean lodMode); - -/** - * object detection for anroid - */ -JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( - JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims); - -JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_fetch(JNIEnv *env, - jclass thiz, - jstring varName); - -/** - * object detection for anroid - */ -JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv( - JNIEnv *env, jclass thiz, jbyteArray yuv, jint imgwidth, jint imgHeight, - jintArray ddims, jfloatArray meanValues); - -/** - * object detection for anroid - */ -JNIEXPORT jlongArray JNICALL -Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray 
buf); - -/** - * setThreadCount for multithread - */ -JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env, - jclass thiz, - jint threadCount); -/** - * clear data of the net when destroy for android - */ -JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env, - jclass thiz); -} // namespace jni -} // namespace paddle_mobile -#ifdef __cplusplus -} -#endif - -#endif diff --git a/mobile/src/io/loader.h b/mobile/src/io/loader.h deleted file mode 100644 index 7a04da1230..0000000000 --- a/mobile/src/io/loader.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "common/types.h" -#include "framework/program/program.h" - -namespace paddle_mobile { - -template -class Loader { - public: - const framework::Program Load(const std::string &dirname, - bool optimize = false, - bool quantification = false, - bool can_add_split = false); - - const framework::Program Load(const std::string &model_path, - const std::string ¶_path, - bool optimize = false, - bool quantification = false); - - const framework::Program LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf, bool optimize = false, - bool quantification = false); - - private: - const framework::Program LoadProgram(const std::string &model_path, - bool optimize = false, - bool quantification = false, - bool can_add_split = false); -}; - -} // namespace paddle_mobile diff --git a/mobile/src/io/opencl_interface.cpp b/mobile/src/io/opencl_interface.cpp deleted file mode 100644 index 636cd1b760..0000000000 --- a/mobile/src/io/opencl_interface.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef PADDLE_MOBILE_CL - -#include "io/opencl_interface.h" -#include "framework/cl/cl_engine.h" -#include "framework/cl/cl_scope.h" - -namespace paddle_mobile { - -cl_context getContext() { - return framework::CLEngine::Instance()->getContext(); -} - -cl_command_queue getClCommandQueue() { - return framework::CLEngine::Instance()->getClCommandQueue(); -} - -bool isInitSuccess() { - prepareOpenclRuntime(); - return framework::CLEngine::Instance()->isInitSuccess(); -} - -bool prepareOpenclRuntime() { -#ifdef PREPARE_OPENCL_RUNTIME - DLOG << "cl runtime prepared. "; - cl_uint numPlatforms; // the NO. of platforms - cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); - if (status == CL_SUCCESS) { - if (numPlatforms > 0) { - cl_platform_id *platforms = reinterpret_cast( - malloc(numPlatforms * sizeof(cl_platform_id))); - status = clGetPlatformIDs(numPlatforms, platforms, NULL); - free(platforms); - } - } -#endif - return true; -} - -} // namespace paddle_mobile -#endif diff --git a/mobile/src/io/opencl_interface.h b/mobile/src/io/opencl_interface.h deleted file mode 100644 index 6a3608790a..0000000000 --- a/mobile/src/io/opencl_interface.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_MOBILE_CL -#include "CL/cl.h" - -namespace paddle_mobile { - -cl_context getContext(); -cl_command_queue getClCommandQueue(); -bool isInitSuccess(); -bool prepareOpenclRuntime(); - -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/io/paddle_inference_api.h b/mobile/src/io/paddle_inference_api.h deleted file mode 100644 index 6f3ba182f6..0000000000 --- a/mobile/src/io/paddle_inference_api.h +++ /dev/null @@ -1,238 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file contains the definition of a simple Inference API for Paddle. - * - * ATTENTION: It requires some C++ features, for lower version C++ or C, we - * might release another API. - */ - -#pragma once - -#include -#include -#include -#include - -namespace paddle_mobile { - -#ifdef PADDLE_MOBILE_FPGA - -namespace fpga { -int open_device(); -int close_device(); -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); - -// Usage: -// auto version = fpga::paddle_mobile_version(); -// std::cout << "0X0" << std::hex << version << std::endl; -uint32_t paddle_mobile_version(); -} // namespace fpga -#endif - -enum PaddleDType { - FLOAT32, - FLOAT16, - INT64, - INT8, - UINT8, -}; - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -class PaddleBuf { - public: - PaddleBuf() = default; - PaddleBuf(PaddleBuf&& other); - // Copy only available when memory is managed externally. 
- explicit PaddleBuf(const PaddleBuf&); - PaddleBuf& operator=(const PaddleBuf&); - // Do not own the memory. - PaddleBuf(void* data, size_t length) - : data_(data), length_(length), memory_owned_{false} {} - // Own memory. - explicit PaddleBuf(size_t length) - : data_(new char[length]), length_(length), memory_owned_(true) {} - // Resize to `length` bytes. - void Resize(size_t length); - // Reset to external memory. - void Reset(void* data, size_t length); - bool empty() const { return length_ == 0; } - void* data() const { return data_; } - size_t length() const { return length_; } - - ~PaddleBuf() { Free(); } - - private: - void Free(); - void* data_{nullptr}; // pointer to the data memory. - size_t length_{0}; // number of memory bytes. - bool memory_owned_{true}; -}; - -typedef enum { - paddle_void = 0, - paddle_float, - paddle_int, - paddle_uint16_t, - paddle_double, - paddle_int64_t, - paddle_size_t, - paddle_int16_t, - paddle_int8_t, - paddle_uint8_t, - paddle_bool, - paddle_string, - paddle_floats = 100, - paddle_ints, - paddle_int64_ts, - paddle_size_ts, - paddle_bools, - paddle_strings, - paddle_const_float = 200, - paddle_const_int, - paddle_block = 300, - paddle_tensor, - paddle_lod_tensor, - paddle_blocks, - paddle_tensors, - paddle_lod_tensors, - paddle_p_block = 400, - paddle_p_tensor, - paddle_p_lod_tensor, - paddle_p_blocks, - paddle_p_tensors, - paddle_p_lod_tensors, - paddle_scopes = 500, - paddle_selected_rows, - paddle_dim0 = 600, - paddle_dim1, - paddle_dim2, - paddle_dim3, - paddle_dim4, - paddle_dim5, - paddle_dim6, - paddle_dim7, - paddle_dim8, - paddle_dim9, -#ifdef PADDLE_MOBILE_CL - paddle_cl_image, -#endif -} PaddlekTypeId_t; - -struct PaddleTensor { - PaddleTensor() = default; - std::string name; // variable name. - std::vector shape; - std::vector lod; - PaddleBuf data; // blob of data. 
- PaddleDType dtype; - PaddlekTypeId_t dtypeid; - LayoutType layout; -}; - -enum class PaddleEngineKind { - kPaddleMobile, - // TODO(Superjomn) support following engines latter. - // kTensorRT, // Use TensorRT for inference. - // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. - // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. -}; - -/* - * A simple Inference API for Paddle. Currently this API can be used by - * non-sequence scenerios. - */ -class PaddlePredictor { - public: - struct Config; - PaddlePredictor(const PaddlePredictor&) = delete; - PaddlePredictor& operator=(const PaddlePredictor&) = delete; - - // Predict an record. - // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be available until Run returns. Caller should be - // responsible for the output tensor's buffer, either allocated or passed from - // outside. - - virtual bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) = 0; - virtual std::string GetExceptionMsg() { return ""; } - // Destroy the Predictor. - virtual ~PaddlePredictor() = default; - - // The common configs for all the predictors. - struct Config { - std::string model_dir; // path to the model directory. 
- std::string prog_file; - std::string param_file; - }; -#ifdef PADDLE_MOBILE_FPGA - virtual void Predict_From_To(int start, int end) = 0; - virtual void FeedPaddleTensors(const std::vector& inputs) = 0; - virtual void FetchPaddleTensors(std::vector* outputs) = 0; - virtual void FetchPaddleTensors(PaddleTensor* outputs, int id) = 0; - virtual void GetPaddleTensor(const std::string& name, - PaddleTensor* output) = 0; -#else - virtual void Feed(const std::string& var_name, const PaddleTensor& input) = 0; - virtual void Fetch(const std::string& var_name, PaddleTensor* output) = 0; - virtual bool Run() = 0; -#endif - - protected: - PaddlePredictor() = default; -}; - -struct PaddleModelMemoryPack { - bool from_memory = false; - size_t model_size = 0; - uint8_t* model_buf = nullptr; - size_t combined_params_size = 0; - uint8_t* combined_params_buf = nullptr; -}; - -struct PaddleMobileConfig : public PaddlePredictor::Config { - enum Precision { FP32 = 0 }; - enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 }; - enum PrePostType { NONE_PRE_POST = 0, UINT8_255 = 1 }; - - enum Precision precision; - enum Device device; - enum PrePostType pre_post_type; - - int batch_size = 1; - bool optimize = true; - bool quantification = false; - int quantification_fold = 1; - bool lod_mode = false; - int thread_num = 1; - bool load_when_predict = false; - bool mem_opt = true; - std::string cl_path; - struct PaddleModelMemoryPack memory_pack; -}; - -// A factory to help create different predictors. -template -std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - -} // namespace paddle_mobile diff --git a/mobile/src/io/paddle_mobile.cpp b/mobile/src/io/paddle_mobile.cpp deleted file mode 100644 index be69ce0f63..0000000000 --- a/mobile/src/io/paddle_mobile.cpp +++ /dev/null @@ -1,550 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "io/paddle_mobile.h" -#include -#include "common/common.h" -#ifdef _OPENMP -#include -#endif // _OPENMP -#ifdef PADDLE_MOBILE_CL -#include -#include // NOLINT -#include "framework/cl/cl_engine.h" -#include "framework/cl/cl_tensor.h" -#endif -#include "operators/math/gemm.h" - -namespace paddle_mobile { - -template -void PaddleMobile::SetThreadNum(int thread_num, - PowerMode power_mode) { - executor_->SetThreadNum(thread_num, power_mode); -} - -template -PMStatus PaddleMobile::Load(const std::string &dirname, - bool optimize, bool quantification, - int batch_size, bool lod_mode, - int quantification_fold) { - if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); - } else { - LOG(kLOG_INFO) << "loader inited"; - } - - if (executor_.get() == nullptr) { - executor_ = std::make_shared>( - loader_->Load(dirname, optimize, quantification, false, - quantification_fold), - config_, batch_size, optimize, lod_mode); - } else { - LOG(kLOG_INFO) << "executor inited"; - } - - return PMSuccess; -} - -template -PMStatus PaddleMobile::Load(const std::string &model_path, - const std::string ¶_path, - bool optimize, bool quantification, - int batch_size, bool lod_mode, - int quantification_fold) { - if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); - } else { - LOG(kLOG_INFO) << "loader inited"; - LOG(kLOG_INFO) << "loader inited"; - } - - if (executor_.get() == nullptr) { - executor_ = std::make_shared>( - 
loader_->Load(model_path, para_path, optimize, quantification, - quantification_fold), - config_, batch_size, optimize, lod_mode); - } else { - LOG(kLOG_INFO) << "executor inited"; - } - - return PMSuccess; -} - -template -PMStatus PaddleMobile::Load(const PaddleMobileConfig &config) { - if (!config.model_dir.empty()) { - return this->Load(config.model_dir, config.optimize, config.quantification, - config.batch_size, config.lod_mode, - config.quantification_fold); - } else if (!config.prog_file.empty() && !config.param_file.empty()) { - return this->Load(config.prog_file, config.param_file, config.optimize, - config.quantification, config.batch_size, config.lod_mode, - config.quantification_fold); - } else { - LOG(kLOG_ERROR) << "Failed to load inference model"; - return PMNotInitialized; - } -} - -template -bool PaddleMobile::LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize, bool quantification, - int batch_size, bool lod_mode, int quantification_fold) { - if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); - } else { - LOG(kLOG_INFO) << "loader inited"; - } - if (executor_.get() == nullptr) { - executor_ = std::make_shared>( - loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, - combined_params_buf, optimize, - quantification, quantification_fold), - config_, batch_size, optimize, lod_mode); - } else { - LOG(kLOG_INFO) << "executor inited"; - } - - return PMSuccess; -} - -template -PMStatus PaddleMobile::Predict(const framework::Tensor &input) { - std::vector> inputs; - inputs.push_back(std::make_pair("feed", input)); - return this->Predict(inputs); -} - -template -PMStatus PaddleMobile::Predict(const framework::LoDTensor &input) { - std::vector> inputs; - inputs.push_back(std::make_pair("feed", input)); - return this->Predict(inputs); -} - -template -PMStatus PaddleMobile::Predict( - const std::vector> &inputs) { - return 
executor_->Predict(inputs); -} - -template -PMStatus PaddleMobile::Predict( - const std::vector> &inputs) { - return executor_->Predict(inputs); -} - -template -std::vector PaddleMobile::Predict( - const std::vector &input, const std::vector &dims) { - return executor_->Predict(input, dims); -} - -template -PMStatus PaddleMobile::Predict() { - return executor_->Predict(); -} - -template -void PaddleMobile::Feed(const std::string &var_name, - const framework::Tensor &input) { - executor_->SetInput(input, var_name); -} - -template -void PaddleMobile::Feed(const std::string &var_name, - const framework::LoDTensor &input) { - executor_->SetInput(input, var_name); -} - -typedef std::shared_ptr LoDTensorPtr; -template -LoDTensorPtr PaddleMobile::Fetch(const std::string &var_name) { - return executor_->GetOutput(var_name); -} - -#ifdef PADDLE_MOBILE_CL -template -const framework::CLImage *PaddleMobile::FetchImage( - const std::string &var_name) { - return executor_->GetOutputImage(var_name); -} -#endif - -template -void PaddleMobile::Clear() { - executor_ = nullptr; - loader_ = nullptr; -} - -template -double PaddleMobile::GetPredictTime() {} - -template -std::string PaddleMobile::GetExceptionMsg() { - if (executor_.get() != nullptr) { - return executor_->GetExceptionMsg(); - } - return ""; -} - -#ifdef PADDLE_MOBILE_CPU -template <> -double PaddleMobile::GetPredictTime() { - int m = 32; - int n = 224 * 224; - int k = 27; - int lda = k; - int ldb = n; - int ldc = n; - float *a = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * k)); - float *b = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * k * n)); - float *c = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); - int t1 = 1; - int t2 = 1; - for (int i = 0; i < m * k; ++i) { - a[i] = t1 + rand() % t2; // NOLINT - } - for (int i = 0; i < k * n; ++i) { - b[i] = t1 + rand() % t2; // NOLINT - } - - operators::math::Gemm gemm; - auto time1 = paddle_mobile::time(); - int times = 4; - 
for (int j = 0; j < times; ++j) { - gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, - static_cast(0), c, ldc, false, - static_cast(nullptr)); - } - - auto time2 = paddle_mobile::time(); - double cost = paddle_mobile::time_diff(time1, time2) / times; - paddle_mobile::memory::Free(a); - paddle_mobile::memory::Free(b); - paddle_mobile::memory::Free(c); - return cost; -} -#endif - -#ifdef PADDLE_MOBILE_FPGA -template -void PaddleMobile::InjectVariable(const framework::Tensor &t, - std::string var_name) { - executor_->InjectVariable(t, var_name); -} - -template -void PaddleMobile::FeedData(const framework::Tensor &t) { - executor_->FeedData(t); -} - -template -void PaddleMobile::FeedData(const std::vector &v) { - executor_->FeedData(v); -} -template -void PaddleMobile::FeedTensorData( - const std::vector &v) { - executor_->FeedTensorData(v); -} - -template -void PaddleMobile::GetResults(std::vector *v) { - executor_->GetResults(v); -} - -template -void PaddleMobile::GetTensorResults( - std::vector *v) { - executor_->GetTensorResults(v); -} - -template -framework::Tensor *PaddleMobile::GetTensorByName( - const std::string &name) { - return executor_->GetTensorByName(name); -} - -template -std::shared_ptr PaddleMobile::FetchResult( - int id) { - return executor_->FetchResult(id); -} - -template -void PaddleMobile::Predict_From_To(int start, int end) { - executor_->Predict_From_To(start, end); -} - -template -void PaddleMobile::Predict_From(int start) { - executor_->Predict_From(start); -} - -template -void PaddleMobile::Predict_To(int end) { - executor_->Predict_To(end); -} -#endif - -#ifdef PADDLE_MOBILE_CL -static std::mutex lc; -template -void PaddleMobile::SetCLPath(std::string path) { - std::lock_guard lock(lc); - if (framework::CLEngine::Instance()->GetCLPath() == "") { - framework::CLEngine::Instance()->setClPath(path); - } -} -template <> -double PaddleMobile::GetPredictTime() { - cl_int status; - if (!framework::CLEngine::Instance()->isInitSuccess()) { - 
return -1; - } - cl_context context = framework::CLEngine::Instance()->getContext(); - cl_command_queue queue = framework::CLEngine::Instance()->getClCommandQueue(); - - int n = 1; - int c = 3; - int h = 224; - int w = 224; - float *input = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * 3 * 224 * 224)); - float *filter = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * 32 * 27)); - int input_w = w * (c + 3) / 4; - int input_h = n * h; - int filter_w = 3 * (3 + 3) / 4; - int filter_h = 32 * 3; - int output_w = 224 * (32 + 3) / 4; - int output_h = 1 * 224; - - framework::DDim input_dims = {1, 3, 224, 224}; - framework::CLTensor input_cl_tensor(context, queue); - input_cl_tensor.Resize(input_dims); - cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input); - - framework::DDim filter_dims = {32, 3, 3, 3}; - framework::CLTensor filter_cl_tensor(context, queue); - input_cl_tensor.Resize(filter_dims); - cl_mem filterBuffer = filter_cl_tensor.mutable_with_data(filter); - - cl_mem cl_filter_image = NULL; - cl_mem cl_input_image = NULL; - cl_mem cl_output_image = NULL; - cl_image_format cf = {.image_channel_order = CL_RGBA, - .image_channel_data_type = CL_HALF_FLOAT}; - cl_input_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, input_w, - input_h, 0, NULL, &status); - cl_filter_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, - filter_w, filter_h, 0, NULL, &status); - cl_output_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, - output_w, output_h, 0, NULL, &status); - char *code; - std::string path = framework::CLEngine::Instance()->GetCLPath() + - "/cl_kernel/feed_kernel.cl"; - size_t length = readText(path.c_str(), &code); - cl_program program = clCreateProgramWithSource( - context, 1, (const char **)&code, &length, NULL); - std::string path1 = "-cl-fast-relaxed-math -I " + - framework::CLEngine::Instance()->GetCLPath() + - "/cl_kernel"; - clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); - cl_kernel 
kernel = clCreateKernel(program, "feed", &status); - - int out_H = 224; - int out_W = 224; - int out_C = 3; - int Stride2 = out_C * out_H * out_W; - int Stride1 = out_H * out_W; - int Stride0 = out_W; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); - CL_CHECK_ERRORS(status); - - size_t global_work_size[3] = {1, 224, 224}; - - // cl_event out_event = param.Out()->GetClEvent(); - - status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, - NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - - out_H = 3; - out_W = 3; - out_C = 3; - Stride2 = out_C * out_H * out_W; - Stride1 = out_H * out_W; - Stride0 = out_W; - - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); - 
CL_CHECK_ERRORS(status); - - size_t global_work_size1[3] = {1, 3, 96}; - - // cl_event out_event = param.Out()->GetClEvent(); - - status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size1, - NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - - clFinish(queue); - // queue = clCreateCommandQueue(context, listDevice[0], 0, &status); - - path = framework::CLEngine::Instance()->GetCLPath() + - "/cl_kernel/conv_kernel.cl"; - size_t length1 = readText(path.c_str(), &code); - program = clCreateProgramWithSource(context, 1, (const char **)&code, - &length1, &status); - CL_CHECK_ERRORS(status); - clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); - kernel = clCreateKernel(program, "conv_3x3", &status); - CL_CHECK_ERRORS(status); - - int c_block = (32 + 3) / 4; - int nh = n * h; - int stride = 1; - int offset = 0; - int input_c = (c + 3) / 4; - int dilation = 1; - int input_width = 224; - int input_height = 224; - int output_width = 224; - int output_height = 224; - int has_group = 0; - int filter_channel = 3; - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &cl_filter_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &cl_output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), 
&input_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 14, sizeof(int), &filter_channel); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 15, sizeof(int), &has_group); - CL_CHECK_ERRORS(status); - - // cl_event out_event = param.Output()->GetClEvent(); - // cl_event wait_event = param.Input()->GetClEvent(); - size_t global_work_size2[3] = {8, 224, 224}; - auto time1 = paddle_mobile::time(); - int times = 10; - for (int i = 0; i < times; ++i) { - status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size2, - NULL, 0, NULL, NULL); - } - CL_CHECK_ERRORS(status); - clFinish(queue); - auto time2 = paddle_mobile::time(); - paddle_mobile::memory::Free(input); - paddle_mobile::memory::Free(filter); - if (status == CL_SUCCESS) { - return paddle_mobile::time_diff(time1, time2) / times; - } else { - return -1; - } -} -template -int PaddleMobile::readText( - const char *kernelPath, - char **pcode) { // 读取文本文件放入 pcode,返回字符串长度 - FILE *fp; - int size; - // printf(" File: %s\n", kernelPath); - fp = fopen(kernelPath, "rb"); - if (!fp) { - printf(" Open file failed\n"); - return -1; - } - if (fseek(fp, 0, SEEK_END) != 0) { - printf(" Seek end of file failed\n"); - return -1; - } - if ((size = ftell(fp)) < 0) { - printf(" Get file position failed\n"); - return -1; - } - rewind(fp); - if ((*pcode = reinterpret_cast(malloc(size + 1))) == NULL) { - printf(" Allocate space failed\n"); - return -1; - } - fread(*pcode, 1, size, fp); - (*pcode)[size] = '\0'; - fclose(fp); - return size + 1; -} -#endif - -template class PaddleMobile; -template class PaddleMobile; -template class PaddleMobile; - -} // namespace paddle_mobile diff --git a/mobile/src/io/paddle_mobile.h 
b/mobile/src/io/paddle_mobile.h deleted file mode 100644 index 8c40b0696a..0000000000 --- a/mobile/src/io/paddle_mobile.h +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include "common/types.h" -#include "framework/executor.h" -#include "framework/load_ops.h" -#include "framework/loader.h" -#include "framework/tensor.h" -#include "io/paddle_inference_api.h" -#ifdef PADDLE_MOBILE_CL -#include "framework/cl/cl_engine.h" -#include "io/opencl_interface.h" -#endif - -namespace paddle_mobile { - -template -class PaddleMobile { - public: - explicit PaddleMobile(PaddleMobileConfigInternal config) : config_(config) { - bool is_gpu = std::is_same, Device>::value; -#ifndef PADDLE_MOBILE_CL - PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); -#else - if (is_gpu) { - prepareOpenclRuntime(); - } -#endif - } - - PaddleMobile() { - bool is_gpu = std::is_same, Device>::value; -#ifndef PADDLE_MOBILE_CL - PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); -#else - if (is_gpu) { // recheck when run cpu in with opencl. 
- prepareOpenclRuntime(); - } -#endif - } - virtual ~PaddleMobile() { Clear(); } - - PMStatus Load(const std::string &dirname, const bool optimize = false, - const bool quantification = false, const int batch_size = 1, - const bool lod_mode = false, const int quantification_fold = 1); - PMStatus Load(const std::string &model_path, const std::string ¶_path, - const bool optimize = false, const bool quantification = false, - const int batch_size = 1, const bool lod_mode = false, - const int quantification_fold = 1); - - PMStatus Load(const PaddleMobileConfig &config); - - PMStatus Predict(const framework::Tensor &input); - PMStatus Predict(const framework::LoDTensor &input); - - PMStatus Predict( - const std::vector> &inputs); - PMStatus Predict( - const std::vector> &inputs); - - std::vector Predict(const std::vector &input, - const std::vector &dims); - PMStatus Predict(); - - void Feed(const std::string &var_name, const framework::LoDTensor &input); - void Feed(const std::string &var_name, const framework::Tensor &input); - - typedef std::shared_ptr LoDTensorPtr; - LoDTensorPtr Fetch(const std::string &var_name); -#ifdef PADDLE_MOBILE_CL - const framework::CLImage *FetchImage(const std::string &var_name); -#endif - - LoDTensorPtr Fetch() { return Fetch("fetch"); } - - bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, - size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize = false, - bool quantification = false, int batch_size = 1, - bool lod_mode = false, int quantification_fold = 1); - - void SetThreadNum(int thread_num, - PowerMode power_mode = PERFORMANCE_PRIORITY); - void Clear(); - double GetPredictTime(); - std::string GetExceptionMsg(); - -#ifdef PADDLE_MOBILE_FPGA - void InjectVariable(const framework::Tensor &t, std::string var_name); - void FeedData(const framework::Tensor &t); - void FeedData(const std::vector &v); - void FeedTensorData(const std::vector &v); - - void GetResults(std::vector *v); - void 
GetTensorResults(std::vector *v); - framework::Tensor *GetTensorByName(const std::string &name); - - std::shared_ptr FetchResult(int id = -1); - void Predict_From_To(int start = 0, int end = -1); - void Predict_From(int start); - void Predict_To(int end); -#endif - -#ifdef PADDLE_MOBILE_CL - public: // NOLINT - void SetCLPath(std::string cl_path); - int readText(const char *kernelPath, - char **pcode); // 读取文本文件放入 pcode,返回字符串长度 -#endif - - private: - std::shared_ptr> loader_; - std::shared_ptr> executor_; - PaddleMobileConfigInternal config_; -}; - -} // namespace paddle_mobile diff --git a/mobile/src/io/paddle_mobile_wrap.cpp b/mobile/src/io/paddle_mobile_wrap.cpp deleted file mode 100644 index b8fd3097e2..0000000000 --- a/mobile/src/io/paddle_mobile_wrap.cpp +++ /dev/null @@ -1,361 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "io/paddle_mobile_wrap.h" - -#include "io/api_paddle_mobile.h" -#include "io/paddle_mobile.h" - -namespace paddle_mobile { -namespace wrap { - -#ifndef PADDLE_MOBILE_FPGA - -// ddim class -int DDim::size() { return dims.size(); } - -int64_t &DDim::operator[](int idx) { - if (0 <= idx && idx < dims.size()) { - return dims[idx]; - } - int64_t non_exist = 0; - return non_exist; -} - -int64_t DDim::operator[](int idx) const { - if (0 <= idx && idx < dims.size()) { - return dims[idx]; - } - return 0; -} - -DDim make_ddim(const std::vector &dims) { - DDim ddim; - for (auto dim : dims) { - ddim.dims.push_back(dim); - } - return ddim; -} - -// tensor class - -Tensor::Tensor(float *data, DDim ddim) { - this->data_ = data; - this->ddim_ = ddim; -} - -float *Tensor::data() const { return this->data_; } - -DDim Tensor::dims() const { return this->ddim_; } - -// net class - -void Net::SetThreadNum(int threads) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - engine->SetThreadNum(threads); - } - } -} - -void Net::SetCLPath(std::string path) { -#ifdef PADDLE_MOBILE_CL - if (this->device_ == kGPU_CL) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - engine->SetCLPath(path); - } -#endif -} - -bool Net::Load(const std::string &dirname, const bool optimize, - const bool quantification, const int batch_size, - const bool lod_mode) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - paddle_mobile::PMStatus status = - engine->Load(dirname, optimize, quantification, batch_size, lod_mode); - return status == paddle_mobile::PMSuccess; - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - paddle_mobile::PMStatus status = - engine->Load(dirname, optimize, quantification, batch_size, lod_mode); - return 
status == paddle_mobile::PMSuccess; - } -#else - return false; -#endif - } - return false; -} - -bool Net::Load(const std::string &model_path, const std::string ¶_path, - const bool optimize, const bool quantification, - const int batch_size, const bool lod_mode) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - paddle_mobile::PMStatus status = - engine->Load(model_path, para_path, optimize, quantification, - batch_size, lod_mode); - return status == paddle_mobile::PMSuccess; - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - paddle_mobile::PMStatus status = - engine->Load(model_path, para_path, optimize, quantification, - batch_size, lod_mode); - return status == paddle_mobile::PMSuccess; - } -#else - return false; -#endif - } - return false; -} - -bool Net::LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, - size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize, - bool quantification, int batch_size, - bool lod_mode) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - bool status = engine->LoadCombinedMemory( - model_len, model_buf, combined_params_len, combined_params_buf, - optimize, quantification, batch_size, lod_mode); - return status; - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - bool status = engine->LoadCombinedMemory( - model_len, model_buf, combined_params_len, combined_params_buf, - optimize, quantification, batch_size, lod_mode); - return status; - } -#else - return false; -#endif - } - return false; -} - -std::vector Net::Predict(const std::vector &input, - const std::vector &dims) { - if (this->device_ == kCPU) { - auto engine = - 
(paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto result = engine->Predict(input, dims); - return result; - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto result = engine->Predict(input, dims); - return result; - } -#else - return std::vector(); -#endif - } - return std::vector(); -} - -bool Net::Predict() { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - paddle_mobile::PMStatus status = engine->Predict(); - return status == paddle_mobile::PMSuccess; - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - paddle_mobile::PMStatus status = engine->Predict(); - return status == paddle_mobile::PMSuccess; - } -#else - return false; -#endif - } - return false; -} - -bool Net::Predict(const Tensor &input) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto input_data = input.data(); - auto input_dims = input.dims(); - std::vector input_dims_as_vector = input_dims.dims; - paddle_mobile::framework::Tensor input_inner( - input_data, - paddle_mobile::framework::make_ddim(input_dims_as_vector)); - paddle_mobile::PMStatus status = engine->Predict(input_inner); - return status == paddle_mobile::PMSuccess; - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto input_data = input.data(); - auto input_dims = input.dims(); - std::vector input_dims_as_vector = input_dims.dims; - paddle_mobile::framework::Tensor input_inner( - input_data, - paddle_mobile::framework::make_ddim(input_dims_as_vector)); - paddle_mobile::PMStatus status = engine->Predict(input_inner); - 
return status == paddle_mobile::PMSuccess; - } -#else - return false; -#endif - } - return false; -} - -void Net::Feed(const std::string &var_name, const Tensor &input) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto input_data = input.data(); - auto input_dims = input.dims(); - std::vector input_dims_as_vector = input_dims.dims; - paddle_mobile::framework::Tensor input_inner( - input_data, - paddle_mobile::framework::make_ddim(input_dims_as_vector)); - engine->Feed(var_name, input_inner); - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto input_data = input.data(); - auto input_dims = input.dims(); - std::vector input_dims_as_vector = input_dims.dims; - paddle_mobile::framework::Tensor input_inner( - input_data, - paddle_mobile::framework::make_ddim(input_dims_as_vector)); - engine->Feed(var_name, input_inner); - } -#else - return; -#endif - } -} - -std::shared_ptr Net::Fetch(const std::string &var_name) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto output_inner = engine->Fetch(var_name); - auto ddim_inner = output_inner->dims(); - std::vector ddim_as_vector; - for (int i = 0; i < ddim_inner.size(); i++) { - ddim_as_vector.push_back(ddim_inner[i]); - } - auto ddim = make_ddim(ddim_as_vector); - auto output_data = output_inner->data(); - std::shared_ptr ptr(new Tensor(output_data, ddim)); - return ptr; - } - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - if (engine != nullptr) { - auto output_inner = engine->Fetch(var_name); - auto ddim_inner = output_inner->dims(); - std::vector ddim_as_vector; - for (int i = 0; i < ddim_inner.size(); i++) { - ddim_as_vector.push_back(ddim_inner[i]); - } - auto ddim = 
make_ddim(ddim_as_vector); - auto output_data = output_inner->data(); - std::shared_ptr ptr(new Tensor(output_data, ddim)); - return ptr; - } -#else - return nullptr; -#endif - } - return nullptr; -} - -Net::Net(DeviceTypeEnum device) { - if (this->engine_ == nullptr) { - PaddleMobileConfigInternal config; - this->device_ = device; - if (this->device_ == kCPU) { - this->engine_ = - new paddle_mobile::PaddleMobile(config); - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - this->engine_ = - new paddle_mobile::PaddleMobile(config); -#endif - } - } -} - -Net::~Net() { - if (this->engine_ != nullptr) { - if (this->device_ == kCPU) { - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - delete engine; - this->engine_ = nullptr; - } else if (this->device_ == kGPU_CL) { -#ifdef PADDLE_MOBILE_CL - auto engine = - (paddle_mobile::PaddleMobile *)this->engine_; - delete engine; - this->engine_ = nullptr; -#endif - } - } -} - -#endif - -} // namespace wrap -} // namespace paddle_mobile diff --git a/mobile/src/io/paddle_mobile_wrap.h b/mobile/src/io/paddle_mobile_wrap.h deleted file mode 100644 index 28c954dbc7..0000000000 --- a/mobile/src/io/paddle_mobile_wrap.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace paddle_mobile { -namespace wrap { - -#ifndef PADDLE_MOBILE_FPGA - -// device type -__attribute__((__visibility__("default"))) enum DeviceTypeEnum { - kCPU = 0, - kGPU_CL = 1 -}; - -// ddim class -class DDim { - public: - __attribute__((__visibility__("default"))) int size(); - __attribute__((__visibility__("default"))) int64_t &operator[](int idx); - __attribute__((__visibility__("default"))) int64_t operator[](int idx) const; - - __attribute__((__visibility__("default"))) std::vector dims; -}; -__attribute__((__visibility__("default"))) DDim make_ddim( - const std::vector &dims); - -// tensor class -class Tensor { - public: - __attribute__((__visibility__("default"))) Tensor(float *data, DDim ddim); - - __attribute__((__visibility__("default"))) float *data() const; - __attribute__((__visibility__("default"))) DDim dims() const; - - private: - float *data_; - DDim ddim_; -}; - -// net class -class Net { - public: - __attribute__((__visibility__("default"))) Net(DeviceTypeEnum device); - __attribute__((__visibility__("default"))) ~Net(); - __attribute__((__visibility__("default"))) void SetThreadNum(int thread_num); - __attribute__((__visibility__("default"))) void SetCLPath(std::string path); - __attribute__((__visibility__("default"))) bool Load( - const std::string &dirname, const bool optimize = false, - const bool quantification = false, const int batch_size = 1, - const bool lod_mode = false); - __attribute__((__visibility__("default"))) bool Load( - const std::string &model_path, const std::string ¶_path, - const bool optimize = false, const bool quantification = false, - const int batch_size = 1, const bool lod_mode = false); - __attribute__((__visibility__("default"))) bool LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize = false, - bool quantification = false, int 
batch_size = 1, bool lod_mode = false); - __attribute__((__visibility__("default"))) std::vector Predict( - const std::vector &input, const std::vector &dims); - __attribute__((__visibility__("default"))) bool Predict(); - __attribute__((__visibility__("default"))) bool Predict(const Tensor &input); - __attribute__((__visibility__("default"))) void Feed( - const std::string &var_name, const Tensor &input); - __attribute__((__visibility__("default"))) std::shared_ptr Fetch( - const std::string &var_name); - - private: - void *engine_ = nullptr; - DeviceTypeEnum device_; -}; - -#endif - -} // namespace wrap -} // namespace paddle_mobile diff --git a/mobile/src/io/paddle_test_inference_api.cpp b/mobile/src/io/paddle_test_inference_api.cpp deleted file mode 100644 index d0c6c48c20..0000000000 --- a/mobile/src/io/paddle_test_inference_api.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "io/paddle_test_inference_api.h" -#include "io/paddle_mobile.h" - -namespace paddle_mobile { - -template -double PaddleTester::CaculatePredictTime(std::string *cl_path) { - PaddleMobile paddle_mobile; -#ifdef PADDLE_MOBILE_CL - if (cl_path) { - paddle_mobile.SetCLPath(*cl_path); - } - -#endif - return paddle_mobile.GetPredictTime(); -} -template class PaddleTester; -template class PaddleTester; - -template class PaddleTester; - -} // namespace paddle_mobile diff --git a/mobile/src/io/paddle_test_inference_api.h b/mobile/src/io/paddle_test_inference_api.h deleted file mode 100644 index 47680a49da..0000000000 --- a/mobile/src/io/paddle_test_inference_api.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file contains the definition of a simple Inference API for Paddle. - * - * ATTENTION: It requires some C++ features, for lower version C++ or C, we - * might release another API. - */ - -#pragma once - -#include "common/types.h" -#include "string" - -namespace paddle_mobile { - -template -class PaddleTester { - public: - double CaculatePredictTime(std::string *cl_path = nullptr); -}; - -} // namespace paddle_mobile diff --git a/mobile/src/memory/t_malloc.cpp b/mobile/src/memory/t_malloc.cpp deleted file mode 100755 index f48a75d3f6..0000000000 --- a/mobile/src/memory/t_malloc.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "memory/t_malloc.h" -#include -#include - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif - -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -#ifdef PADDLE_MOBILE_FPGA_KD -#include "fpga/KD/llapi/zynqmp_api.h" -#endif - -namespace paddle_mobile { -namespace memory { -const int MALLOC_ALIGN = 64; - -#ifdef PADDLE_MOBILE_FPGA -namespace fpga = paddle_mobile::fpga; - -void Copy(void *dst, const void *src, size_t num) { - fpga::fpga_copy(dst, src, num); -} - -void *Alloc(size_t size) { return fpga::fpga_malloc(size); } - -void Free(void *ptr) { - if (ptr) { - fpga::fpga_free(ptr); - } -} - -#elif defined(PADDLE_MOBILE_FPGA_KD) - -void Copy(void *dst, const void *src, size_t num) { - std::memcpy(dst, src, num); -} - -void *Alloc(size_t size) { return zynqmp::fpga_malloc(size); } - -void Free(void *ptr) { - if (ptr) { - zynqmp::fpga_free(ptr); - } -} -#else - -void Copy(void *dst, const void *src, size_t num) { - std::memcpy(dst, src, num); -} - -void *Alloc(size_t size) { - // segmentation fault if size_t overflow on 32-bit platforms - // user should check before calling this function - size_t offset = sizeof(void *) + MALLOC_ALIGN - 1; - char *p = static_cast(malloc(offset + size)); - if (!p) { - return nullptr; - } - void *r = reinterpret_cast(reinterpret_cast(p + offset) & - (~(MALLOC_ALIGN - 1))); - static_cast(r)[-1] = p; - return r; -} - -void Free(void *ptr) { - if (ptr) 
{ - free(static_cast(ptr)[-1]); - } -} - -#endif - -} // namespace memory -} // namespace paddle_mobile diff --git a/mobile/src/memory/t_malloc.h b/mobile/src/memory/t_malloc.h deleted file mode 100644 index b57403b515..0000000000 --- a/mobile/src/memory/t_malloc.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -namespace paddle_mobile { -namespace memory { - -void Copy(void *dst, const void *src, size_t num); - -void *Alloc(size_t size); - -void Free(void *ptr); - -/** - * \brief Free memory block in one place. - * - * \note In some cases, custom deleter is used to - * deallocate the memory automatically for - * std::unique_ptr in tensor.h. - * static_cast - */ -template -class PODDeleter { - static_assert(std::is_pod::value, "T must be POD"); - - public: - explicit PODDeleter(){}; - - void operator()(T *ptr) { Free(static_cast(ptr)); } -}; - -/** - * \brief Free memory block in one place does not meet POD - * - * \note In some cases, custom deleter is used to - * deallocate the memory automatically for - * std::unique_ptr in tensor.h. 
- * reinterpret_cast - */ -template -class PlainDeleter { - public: - explicit PlainDeleter(){}; - - void operator()(T *ptr) { Free(reinterpret_cast(ptr)); } -}; -} // namespace memory -} // namespace paddle_mobile diff --git a/mobile/src/operators/activation_op.cpp b/mobile/src/operators/activation_op.cpp deleted file mode 100755 index 905b881fee..0000000000 --- a/mobile/src/operators/activation_op.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/activation_op.h" - -namespace paddle_mobile { -namespace operators { - -#define DEFINE_ACTIVATION_INFERSHAPE(OpName) \ - template \ - void OpName##Op::InferShape() const { \ - const auto &input_dims = this->param_.InputX()->dims(); \ - this->param_.Out()->Resize(input_dims); \ - } - -#ifdef RELU_OP -DEFINE_ACTIVATION_INFERSHAPE(Relu); -DEFINE_ACTIVATION_INFERSHAPE(Relu6); -#endif // RELU_OP - -#ifdef SIGMOID_OP -DEFINE_ACTIVATION_INFERSHAPE(Sigmoid); -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(sigmoid, ops::SigmoidOp); -#endif -#endif // SIGMOID_OP - -#ifdef TANH_OP -DEFINE_ACTIVATION_INFERSHAPE(Tanh); -#endif // TANH_OP - -#ifdef LOG_OP -DEFINE_ACTIVATION_INFERSHAPE(Log); -#endif // LOG_OP - -#ifdef LEAKY_RELU_OP -DEFINE_ACTIVATION_INFERSHAPE(LeakyRelu); -#endif // LEAKY_RELU_OP - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef RELU_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(relu, ops::ReluOp); -REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(relu, ops::ReluOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(relu, ops::ReluOp); -REGISTER_OPERATOR_CL(relu6, ops::Relu6Op); -#endif -#endif // RELU_OP - -#ifdef SIGMOID_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(sigmoid, ops::SigmoidOp); -#endif -#endif // SIGMOID_OP - -#ifdef TANH_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(tanh, ops::TanhOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(tanh, ops::TanhOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp); -#endif -#endif // TANH_OP - -#ifdef PADDLE_MOBILE_CPU -#ifdef LOG_OP -REGISTER_OPERATOR_CPU(log, ops::LogOp); -#endif // LOG_OP -#endif - -#ifdef LEAKY_RELU_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(leaky_relu, 
ops::LeakyReluOp); -#endif // LEAKY_RELU_OP - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(leaky_relu, ops::LeakyReluOp); -#endif -#endif diff --git a/mobile/src/operators/activation_op.h b/mobile/src/operators/activation_op.h deleted file mode 100644 index cd250080e5..0000000000 --- a/mobile/src/operators/activation_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/activation_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef RELU_OP -DECLARE_OPERATOR(Relu, ReluParam, ReluKernel); -DECLARE_OPERATOR(Relu6, Relu6Param, Relu6Kernel); -#endif - -#ifdef SIGMOID_OP -DECLARE_OPERATOR(Sigmoid, SigmoidParam, SigmoidKernel); -#endif - -#ifdef TANH_OP -DECLARE_OPERATOR(Tanh, TanhParam, TanhKernel); -#endif - -#ifdef LOG_OP -DECLARE_OPERATOR(Log, ReluParam, LogKernel); -#endif - -#ifdef LEAKY_RELU_OP -DECLARE_OPERATOR(LeakyRelu, LeakyReluParam, LeakyReluKernel); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/assign_op.cpp b/mobile/src/operators/assign_op.cpp deleted file mode 100644 index adc038a223..0000000000 --- a/mobile/src/operators/assign_op.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ASSIGN_OP - -#include "operators/assign_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void AssignOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr, - "Input (X) of Assign op should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Output() != nullptr, - "Output (Output) of Assign op should not be null."); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(assign, ops::AssignOp); -#endif - -#endif // ASSIGN_OP diff --git a/mobile/src/operators/assign_op.h b/mobile/src/operators/assign_op.h deleted file mode 100644 index 478330bc3b..0000000000 --- a/mobile/src/operators/assign_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ASSIGN_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/assign_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(Assign, AssignParam, AssignKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/assign_value_op.cpp b/mobile/src/operators/assign_value_op.cpp deleted file mode 100644 index 5100c2246b..0000000000 --- a/mobile/src/operators/assign_value_op.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ASSIGN_VALUE_OP - -#include "operators/assign_value_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void AssignValueOp::InferShape() const { - const auto &shape = this->param_.shape_; - this->param_.output_->Resize(framework::make_ddim(shape)); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(assign_value, ops::AssignValueOp); -#endif - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(assign_value, ops::AssignValueOp); -#endif - -#endif // ASSIGN_VALUE_OP diff --git a/mobile/src/operators/assign_value_op.h b/mobile/src/operators/assign_value_op.h deleted file mode 100644 index ce319d333a..0000000000 --- a/mobile/src/operators/assign_value_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ASSIGN_VALUE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/assign_value_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(AssignValue, AssignValueParam, AssignValueKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/batchnorm_op.cpp b/mobile/src/operators/batchnorm_op.cpp deleted file mode 100644 index 3a272845cc..0000000000 --- a/mobile/src/operators/batchnorm_op.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BATCHNORM_OP - -#include "operators/batchnorm_op.h" -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" - -namespace paddle_mobile { -namespace operators { - -template -void BatchNormOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - this->param_.OutputY()->Resize(x_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(batch_norm, ops::BatchNormOp); -#endif - -#endif diff --git a/mobile/src/operators/batchnorm_op.h b/mobile/src/operators/batchnorm_op.h deleted file mode 100644 index ed46c8657f..0000000000 --- a/mobile/src/operators/batchnorm_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BATCHNORM_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/batchnorm_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class BatchNormOp - : public framework::OperatorWithKernel, - BatchNormKernel> { - public: - BatchNormOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - BatchNormKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/beam_search_decode_op.cpp b/mobile/src/operators/beam_search_decode_op.cpp deleted file mode 100644 index 1038234fe8..0000000000 --- a/mobile/src/operators/beam_search_decode_op.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BEAM_SEARCH_DECODE_OP - -#include "operators/beam_search_decode_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void BeamSearchDecodeOp::InferShape() const {} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(beam_search_decode, ops::BeamSearchDecodeOp); -#endif - -#endif // BEAM_SEARCH_DECODE_OP diff --git a/mobile/src/operators/beam_search_decode_op.h b/mobile/src/operators/beam_search_decode_op.h deleted file mode 100644 index f212959474..0000000000 --- a/mobile/src/operators/beam_search_decode_op.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BEAM_SEARCH_DECODE_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/beam_search_decode_kernel.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(BeamSearchDecode, BeamSearchDecodeParam, - BeamSearchDecodeKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif // BEAM_SEARCH_DECODE_OP diff --git a/mobile/src/operators/beam_search_op.cpp b/mobile/src/operators/beam_search_op.cpp deleted file mode 100644 index 5f83e53667..0000000000 --- a/mobile/src/operators/beam_search_op.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BEAM_SEARCH_OP - -#include "operators/beam_search_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void BeamSearchOp::InferShape() const {} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(beam_search, ops::BeamSearchOp); -#endif - -#endif // BEAM_SEARCH_OP diff --git a/mobile/src/operators/beam_search_op.h b/mobile/src/operators/beam_search_op.h deleted file mode 100644 index 985552d9f6..0000000000 --- a/mobile/src/operators/beam_search_op.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BEAM_SEARCH_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/beam_search_kernel.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(BeamSearch, BeamSearchParam, BeamSearchKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif // BEAM_SEARCH_OP diff --git a/mobile/src/operators/bilinear_interp_op.cpp b/mobile/src/operators/bilinear_interp_op.cpp deleted file mode 100644 index ef5d230873..0000000000 --- a/mobile/src/operators/bilinear_interp_op.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BILINEAR_INTERP_OP - -#include "operators/bilinear_interp_op.h" -#include - -namespace paddle_mobile { -namespace operators { -template -void BilinearOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, - "Input(X) of BilinearInterOp should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, - "Output(Out) of BilinearInterOp should not be null."); - - auto dim_x = this->param_.InputX()->dims(); // NCHW format - int out_h = this->param_.OutH(); - int out_w = this->param_.OutW(); - PADDLE_MOBILE_ENFORCE(dim_x.size() == 4, "X's dimension must be 4"); - bool ignore_scale = false; - if (out_h > 0 && out_w > 0) { - ignore_scale = true; - } - if (this->param_.InputOutPutSize() != nullptr) { - auto out_size_dim = this->param_.InputOutPutSize()->dims(); - - PADDLE_MOBILE_ENFORCE(out_size_dim.size() == 1, - "OutSize's dimension size must be 1"); - PADDLE_MOBILE_ENFORCE(out_size_dim[0] == 2, "OutSize's dim[0] must be 2"); - } - - if (this->param_.HasScale() && !ignore_scale) { - const float scale = this->param_.Scale(); - DLOG << "scale_: " << scale; - std::vector dim_out({dim_x[0], dim_x[1], - static_cast(dim_x[2] * scale), - static_cast(dim_x[3] * scale)}); - this->param_.Out()->Resize(framework::make_ddim(dim_out)); - DLOG << "interp -- dim_out: " << dim_out; - - } else { - std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w}); - this->param_.Out()->Resize(framework::make_ddim(dim_out)); - DLOG << "interp -- dim_out: " << dim_out; - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(bilinear_interp, ops::BilinearOp); -#endif - -#if PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(bilinear_interp, ops::BilinearOp) -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/bilinear_interp_op.h b/mobile/src/operators/bilinear_interp_op.h deleted file mode 100644 index 
2fee40859b..0000000000 --- a/mobile/src/operators/bilinear_interp_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BILINEAR_INTERP_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/bilinear_interp_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class BilinearOp : public framework::OperatorWithKernel< - DeviceType, BilinearInterpParam, - operators::BilinearInterpKernel> { - public: - BilinearOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, BilinearInterpParam, - operators::BilinearInterpKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/box_coder_op.cpp b/mobile/src/operators/box_coder_op.cpp deleted file mode 100644 index 6511266e68..0000000000 --- a/mobile/src/operators/box_coder_op.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BOXCODER_OP - -#include "operators/box_coder_op.h" -#include -namespace paddle_mobile { -namespace operators { - -template -void BoxCoderOp::InferShape() const { - auto input_priorbox_dims = this->param_.InputPriorBox()->dims(); - auto input_priorboxvar_dims = this->param_.InputPriorBoxVar()->dims(); - auto input_targetbox_dims = this->param_.InputTargetBox()->dims(); - - auto code_type = this->param_.CodeType(); - - if (code_type == "encode_center_size") { - if (input_targetbox_dims.size() != 2) { - LOG(kLOG_ERROR) << " The rank of Input of TargetBox must be 2"; - } - if (input_targetbox_dims[1] != 4) { - LOG(kLOG_ERROR) << " The shape of TargetBox is [M, 4]"; - } - } - if (code_type == "decode_center_size") { - if (input_targetbox_dims.size() != 3) { - LOG(kLOG_ERROR) << "The rank of Input of TargetBox must be 3"; - } - if (input_targetbox_dims[1] != input_priorbox_dims[0] || - input_targetbox_dims[2] != input_priorbox_dims[1]) { - LOG(kLOG_ERROR) << " dimension not match"; - } - } - this->param_.OutputBox()->Resize(framework::make_ddim( - {input_targetbox_dims[0], input_priorbox_dims[0], 4})); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(box_coder, ops::BoxCoderOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/box_coder_op.h b/mobile/src/operators/box_coder_op.h deleted file mode 100644 index 417783ca93..0000000000 --- 
a/mobile/src/operators/box_coder_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BOXCODER_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/box_coder_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class BoxCoderOp : public framework::OperatorWithKernel< - DeviceType, BoxCoderParam, - operators::BoxCoderKernel> { - public: - BoxCoderOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::BoxCoderKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/cast_op.cpp b/mobile/src/operators/cast_op.cpp deleted file mode 100644 index 70a3ff6646..0000000000 --- a/mobile/src/operators/cast_op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CAST_OP - -#include "operators/cast_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void CastOp::InferShape() const { - const auto &dims = this->param_.input_->dims(); - this->param_.output_->Resize(dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(cast, ops::CastOp); -#endif - -#endif // CAST_OP diff --git a/mobile/src/operators/cast_op.h b/mobile/src/operators/cast_op.h deleted file mode 100644 index a244d5cfaf..0000000000 --- a/mobile/src/operators/cast_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CAST_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/kernels.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class CastOp : public framework::OperatorWithKernel< - DeviceType, CastParam, - operators::CastKernel> { - public: - CastOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::CastKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // CAST_OP diff --git a/mobile/src/operators/compare_op.cpp b/mobile/src/operators/compare_op.cpp deleted file mode 100644 index 7332e33c62..0000000000 --- a/mobile/src/operators/compare_op.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/compare_op.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef LESS_THAN_OP -template -void LessThanOp::InferShape() const { - const auto &input_dims = this->param_.input_x_->dims(); - this->param_.output_->Resize(input_dims); -} -#endif // LESS_THAN_OP - -#ifdef EQUAL_OP -template -void EqualOp::InferShape() const { - const auto &input_dims = this->param_.input_x_->dims(); - this->param_.output_->Resize(input_dims); -} -#endif // EQUAL_OP - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef LESS_THAN_OP -REGISTER_OPERATOR_CPU(less_than, ops::LessThanOp); -#endif // LESS_THAN_OP -#ifdef EQUAL_OP -REGISTER_OPERATOR_CPU(equal, ops::EqualOp); -#endif // EQUAL_OP diff --git a/mobile/src/operators/compare_op.h b/mobile/src/operators/compare_op.h deleted file mode 100644 index 5fbc350053..0000000000 --- a/mobile/src/operators/compare_op.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/compare_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef LESS_THAN_OP -DECLARE_OPERATOR(LessThan, CompareParam, LessThanKernel); -#endif // LESS_THAN_OP - -#ifdef EQUAL_OP -DECLARE_OPERATOR(Equal, CompareParam, EqualKernel); -#endif // EQUAL_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/concat_op.cpp b/mobile/src/operators/concat_op.cpp deleted file mode 100644 index 3f026a91ef..0000000000 --- a/mobile/src/operators/concat_op.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONCAT_OP - -#include - -#include "operators/concat_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void ConcatOp::InferShape() const { - auto inputs = this->param_.Inputs(); - const size_t n = inputs.size(); - - std::vector inputs_dims; - inputs_dims.reserve(n); - for (int i = 0; i < n; i++) { - inputs_dims.push_back(inputs[i]->dims()); - } - - if (n == 1) { - DLOG << "Warning: concat op have only one input, " - "may waste memory"; - } - - /// add all dim[axis] and check other dims if equal. 
- auto out_dims = inputs_dims[0]; - auto axis = static_cast(this->param_.Axis()) - - (this->param_.original_output_dims_size_ - out_dims.size()); - int in_zero_dims_size = out_dims.size(); - for (size_t i = 1; i < n; i++) { - for (size_t j = 0; j < in_zero_dims_size; j++) { - if (j == axis) { - out_dims[axis] += inputs_dims[i][j]; - } else { - assert(out_dims[j] == inputs_dims[i][j]); - } - } - } - - if (out_dims[axis] < 0) { - out_dims[axis] = -1; - } - - this->param_.Out()->Resize(out_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(concat, ops::ConcatOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp); -#endif - -#endif diff --git a/mobile/src/operators/concat_op.h b/mobile/src/operators/concat_op.h deleted file mode 100644 index 94c402cd85..0000000000 --- a/mobile/src/operators/concat_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONCAT_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/concat_kernel.h" -#include "operators/op_param.h" -namespace paddle_mobile { -namespace operators { -using std::string; -template -class ConcatOp : public framework::OperatorWithKernel< - DeviceType, ConcatParam, - operators::ConcatKernel> { - public: - ConcatOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ConcatKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/conditional_block_op.cpp b/mobile/src/operators/conditional_block_op.cpp deleted file mode 100644 index 0f1e6f7556..0000000000 --- a/mobile/src/operators/conditional_block_op.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONDITIONAL_BLOCK_OP - -#include "operators/conditional_block_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void ConditionalBlockOp::InferShape() const {} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(conditional_block, ops::ConditionalBlockOp); -#endif - -#endif // CONDITIONAL_BLOCK_OP diff --git a/mobile/src/operators/conditional_block_op.h b/mobile/src/operators/conditional_block_op.h deleted file mode 100644 index 8a5dfa5634..0000000000 --- a/mobile/src/operators/conditional_block_op.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONDITIONAL_BLOCK_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/conditional_block_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(ConditionalBlock, ConditionalBlockParam, - ConditionalBlockKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/controlflow/tensor_array_read_write_op.cpp b/mobile/src/operators/controlflow/tensor_array_read_write_op.cpp deleted file mode 100644 index 0ea8ac01c6..0000000000 --- a/mobile/src/operators/controlflow/tensor_array_read_write_op.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/controlflow/tensor_array_read_write_op.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef WRITE_TO_ARRAY_OP -template -void WriteToArrayOp::InferShape() const {} -#endif // WRITE_TO_ARRAY_OP - -#ifdef READ_FROM_ARRAY_OP -template -void ReadFromArrayOp::InferShape() const {} -#endif // READ_FROM_ARRAY_OP - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -#ifdef WRITE_TO_ARRAY_OP -REGISTER_OPERATOR_CPU(write_to_array, ops::WriteToArrayOp); -#endif // WRITE_TO_ARRAY_OP - -#ifdef READ_FROM_ARRAY_OP -REGISTER_OPERATOR_CPU(read_from_array, ops::ReadFromArrayOp); -#endif // READ_FROM_ARRAY_OP -#endif diff --git a/mobile/src/operators/controlflow/tensor_array_read_write_op.h b/mobile/src/operators/controlflow/tensor_array_read_write_op.h deleted file mode 100644 index 21d3ca10ef..0000000000 --- a/mobile/src/operators/controlflow/tensor_array_read_write_op.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/tensor_array_read_write_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef WRITE_TO_ARRAY_OP -DECLARE_OPERATOR(WriteToArray, WriteToArrayParam, WriteToArrayKernel); -#endif // WRITE_TO_ARRAY_OP - -#ifdef READ_FROM_ARRAY_OP -DECLARE_OPERATOR(ReadFromArray, ReadFromArrayParam, ReadFromArrayKernel); -#endif // WRITE_TO_ARRAY_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/controlflow/while_op.cpp b/mobile/src/operators/controlflow/while_op.cpp deleted file mode 100644 index 06eb7c5709..0000000000 --- a/mobile/src/operators/controlflow/while_op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/controlflow/while_op.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef WHILE_OP -template -void WhileOp::InferShape() const { - // TODO(hjchen2) -} -#endif // WHILE_OP - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -#ifdef WHILE_OP -REGISTER_OPERATOR_CPU(while, ops::WhileOp); -#endif // WHILE_OP -#endif diff --git a/mobile/src/operators/controlflow/while_op.h b/mobile/src/operators/controlflow/while_op.h deleted file mode 100644 index 6f753a08ef..0000000000 --- a/mobile/src/operators/controlflow/while_op.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/while_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef WHILE_OP -DECLARE_OPERATOR(While, WhileParam, WhileKernel); -#endif // WHILE_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/conv_op.cpp b/mobile/src/operators/conv_op.cpp deleted file mode 100644 index 88c1262546..0000000000 --- a/mobile/src/operators/conv_op.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#include "operators/conv_op.h" -#include -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void ConvOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); -#endif - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(conv2d, ops::ConvOp); -#endif - -#endif 
diff --git a/mobile/src/operators/conv_op.h b/mobile/src/operators/conv_op.h deleted file mode 100644 index f023e60e72..0000000000 --- a/mobile/src/operators/conv_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/conv_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class ConvOp : public framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::ConvKernel> { - public: - ConvOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ConvKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - private: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/conv_transpose_op.cpp b/mobile/src/operators/conv_transpose_op.cpp deleted file mode 100755 index 522337284f..0000000000 --- a/mobile/src/operators/conv_transpose_op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_TRANSPOSE_OP - -#include "operators/conv_transpose_op.h" - -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose); -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose); -#endif - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(conv2d_transpose, ops::ConvOpTranspose); -#endif - -#endif diff --git a/mobile/src/operators/conv_transpose_op.h b/mobile/src/operators/conv_transpose_op.h deleted file mode 100755 index ace1893311..0000000000 --- a/mobile/src/operators/conv_transpose_op.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_TRANSPOSE_OP - -#pragma once - -#include -#include - -#include "framework/operator.h" -#include "operators/kernel/conv_transpose_kernel.h" - -namespace paddle_mobile { -namespace operators { -template -class ConvOpTranspose : public framework::OperatorWithKernel< - DeviceType, ConvTransposeParam, - operators::ConvTransposeKernel> { - public: - ConvOpTranspose(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, ConvTransposeParam, - operators::ConvTransposeKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const { - auto input = this->param_.Input(); - auto in_dims = input->dims(); - - auto filter = this->param_.Filter(); - auto filter_dims = filter->dims(); - - std::vector strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector dilations = this->param_.Dilations(); - std::vector output_size = this->param_.OutputSize(); - - int groups = this->param_.Groups(); - - PADDLE_MOBILE_ENFORCE( - in_dims.size() == 4 || in_dims.size() == 5, - "ConvTransposeOp intput should be 4-D or 5-D tensor."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() == filter_dims.size(), - "ConvTransposeOp input dimension and filter dimension " - "should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() - strides.size() == 2U, - "ConvTransposeOp input dimension and strides dimension should " - "be consistent."); - PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), - "ConvTransposeOp paddings dimension and strides " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), - "ConvTransposeOp paddings dimension and dilations " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims[1] == filter_dims[0], - "In ConvTransposeOp, The number of input channels should " - "be equal to the number of filter's 
channels."); - - std::vector output_shape({in_dims[0], filter_dims[1] * groups}); - if (output_size.size() == 2) { - output_shape.push_back(output_size[0]); - output_shape.push_back(output_size[1]); - } else { - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - - 2 * paddings[i] + filter_extent); - } - } - - this->param_.Output()->Resize(framework::make_ddim(output_shape)); - } - - private: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/crf_op.cpp b/mobile/src/operators/crf_op.cpp deleted file mode 100644 index 4ab299ebf4..0000000000 --- a/mobile/src/operators/crf_op.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CRF_OP - -#include - -#include "common/enforce.h" -#include "operators/crf_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void CrfOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.InputEmission(), - "Input(Emission) should be not null."); - PADDLE_MOBILE_ENFORCE(this->param_.InputTransition(), - "Input(Transition) should be not null."); - PADDLE_MOBILE_ENFORCE(this->param_.outputVBP(), - "Input(ViterbiPath) should be not null."); - - auto emission_dims = this->param_.InputEmission()->dims(); - PADDLE_MOBILE_ENFORCE(emission_dims.size() == 2U, - "The Input(Emission) should be a 2-D tensor."); - PADDLE_MOBILE_ENFORCE(emission_dims[0], - "An empty mini-batch is not allowed."); - - this->param_.outputVBP()->Resize( - {this->param_.InputEmission()->dims()[0], 1}); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(crf_decoding, ops::CrfOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/crf_op.h b/mobile/src/operators/crf_op.h deleted file mode 100644 index fb0fd90889..0000000000 --- a/mobile/src/operators/crf_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CRF_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/crf_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class CrfOp : public framework::OperatorWithKernel< - DeviceType, CrfParam, - operators::CrfKernel> { - public: - CrfOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::CrfKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/depthwise_conv_op.cpp b/mobile/src/operators/depthwise_conv_op.cpp deleted file mode 100644 index 5413af6ff7..0000000000 --- a/mobile/src/operators/depthwise_conv_op.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DEPTHWISECONV_OP - -#include "operators/depthwise_conv_op.h" -#include -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" -#include "operators/conv_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void DepthwiseConvOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(depthwise_conv2d, ops::DepthwiseConvOp); -#endif -#endif diff --git a/mobile/src/operators/depthwise_conv_op.h b/mobile/src/operators/depthwise_conv_op.h deleted file mode 100644 index d1cbeeab06..0000000000 --- a/mobile/src/operators/depthwise_conv_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DEPTHWISECONV_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/conv_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template -class DepthwiseConvOp : public framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::ConvKernel> { - public: - DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ConvKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/dequantize_op.cpp b/mobile/src/operators/dequantize_op.cpp deleted file mode 100644 index 1c04b3a95f..0000000000 --- a/mobile/src/operators/dequantize_op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DEQUANT_OP - -#include "operators/dequantize_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void DequantizeOp::InferShape() const { - const auto& input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp); -#endif - -#endif // DEQUANT_OP diff --git a/mobile/src/operators/dequantize_op.h b/mobile/src/operators/dequantize_op.h deleted file mode 100644 index 81ab62bee8..0000000000 --- a/mobile/src/operators/dequantize_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DEQUANT_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/dequantize_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class DequantizeOp - : public framework::OperatorWithKernel, - DequantizeKernel> { - public: - DequantizeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - DequantizeKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // DEQUANT_OP diff --git a/mobile/src/operators/detection_ops.cpp b/mobile/src/operators/detection_ops.cpp deleted file mode 100644 index 50df7229e1..0000000000 --- a/mobile/src/operators/detection_ops.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/detection_ops.h" -#include - -namespace paddle_mobile { -namespace operators { - -#ifdef ANCHOR_GENERATOR_OP -template -void AnchorGeneratorOp::InferShape() const { - const auto &input_dims = this->param_.input_->dims(); - // DLOG << "AnchorGenerator input dim =" << input_dims.size(); - PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); - const auto &anchor_sizes = this->param_.anchor_sizes_; - const auto &aspect_ratios = this->param_.aspect_ratios_; - - size_t num_anchors = aspect_ratios.size() * anchor_sizes.size(); - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_anchors; - dim_vec[3] = 4; - - this->param_.output_anchors_->Resize(framework::make_ddim(dim_vec)); - this->param_.output_variances_->Resize(framework::make_ddim(dim_vec)); -} -#endif - -#ifdef PROPOSAL_OP -template -void ProposalOp::InferShape() const { - this->param_.rpn_rois_->Resize(framework::make_ddim({-1, 4})); - this->param_.rpn_probs_->Resize(framework::make_ddim({-1, 1})); -} -#endif - -#ifdef PSROI_POOL_OP -template -void PSRoiPoolOp::InferShape() const { - const auto &rois_dims = this->param_.input_rois_->dims(); - const int pooled_height = this->param_.pooled_height_; - const int pooled_width = this->param_.pooled_width_; - const int output_channels = this->param_.output_channels_; - - auto out_dims = this->param_.input_x_->dims(); - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - this->param_.output_->Resize(out_dims); -} -#endif - -#ifdef ROIALIGN_POOL_OP -template -void RoiAlignPoolOp::InferShape() const { - const auto &rois_dims = this->param_.input_rois_->dims(); - const int pooled_height = this->param_.pooled_height_; - const int pooled_width = this->param_.pooled_width_; - - auto out_dims = this->param_.input_x_->dims(); - out_dims[0] = 
rois_dims[0]; - // out_dims[1] = - // output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - this->param_.output_->Resize(out_dims); -} -#endif - -#ifdef ROI_PERSPECTIVE_OP -template -void RoiPerspectiveOp::InferShape() const { - const auto &input_dims = this->param_.input_x_->dims(); - const auto &rois_dims = this->param_.input_rois_->dims(); - const int transformed_height = this->param_.transformed_height_; - const int transformed_width = this->param_.transformed_width_; - std::vector out_dims_v({rois_dims[0], // num_rois - input_dims[1], // channels - static_cast(transformed_height), - static_cast(transformed_width)}); - auto out_dims = framework::make_ddim(out_dims_v); - this->param_.output_->Resize(out_dims); - - std::vector mask_dims_v({rois_dims[0], // num_rois - 1, // channels - static_cast(transformed_height), - static_cast(transformed_width)}); - auto mask_dims = framework::make_ddim(mask_dims_v); - - std::vector matrix_dims_v({rois_dims[0], 9}); - auto matrix_dims = framework::make_ddim(matrix_dims_v); - this->param_.transform_Matrix_->Resize(matrix_dims); - this->param_.mask->Resize(mask_dims); -} -#endif - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -#ifdef ANCHOR_GENERATOR_OP -REGISTER_OPERATOR_CPU(anchor_generator, ops::AnchorGeneratorOp); -#endif -#ifdef PROPOSAL_OP -REGISTER_OPERATOR_CPU(generate_proposals, ops::ProposalOp); -#endif -#ifdef PSROI_POOL_OP -REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp); -#endif -#ifdef ROI_PERSPECTIVE_OP -REGISTER_OPERATOR_CPU(roi_perspective_transform, ops::RoiPerspectiveOp); -#endif -#endif - -#ifdef PADDLE_MOBILE_FPGA -#ifdef ANCHOR_GENERATOR_OP -REGISTER_OPERATOR_FPGA(anchor_generator, ops::AnchorGeneratorOp); -#endif -#ifdef PROPOSAL_OP -REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp); -#endif -#ifdef PSROI_POOL_OP 
-REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp); -#endif -#ifdef ROIALIGN_POOL_OP -REGISTER_OPERATOR_FPGA(roialign_pool, ops::RoiAlignPoolOp); -#endif - -#endif diff --git a/mobile/src/operators/detection_ops.h b/mobile/src/operators/detection_ops.h deleted file mode 100644 index 3b3a54dc4b..0000000000 --- a/mobile/src/operators/detection_ops.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/detection_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef ANCHOR_GENERATOR_OP -DECLARE_OPERATOR(AnchorGenerator, AnchorGeneratorParam, AnchorGeneratorKernel); -#endif - -#ifdef PROPOSAL_OP -DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel); -#endif - -#ifdef PSROI_POOL_OP -DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel); -#endif - -#ifdef ROIALIGN_POOL_OP -DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel); -#endif - -#ifdef ROI_PERSPECTIVE_OP -DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/dropout_op.cpp b/mobile/src/operators/dropout_op.cpp deleted file mode 100644 index c0dafa424e..0000000000 --- a/mobile/src/operators/dropout_op.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DROPOUT_OP -#include "operators/dropout_op.h" -namespace paddle_mobile { -namespace operators { - -template -void DropoutOp::InferShape() const { - auto input_dims = this->param_.InputX()->dims(); - this->param_.Out()->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(dropout, ops::DropoutOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp); -#endif - -#endif diff --git a/mobile/src/operators/dropout_op.h b/mobile/src/operators/dropout_op.h deleted file mode 100644 index 132b94af69..0000000000 --- a/mobile/src/operators/dropout_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DROPOUT_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/dropout_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class DropoutOp : public framework::OperatorWithKernel< - DeviceType, DropoutParam, - operators::DropoutKernel> { - public: - DropoutOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::DropoutKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/elementwise_add_op.cpp b/mobile/src/operators/elementwise_add_op.cpp deleted file mode 100644 index 1f198aeb03..0000000000 --- a/mobile/src/operators/elementwise_add_op.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEADD_OP - -#include "operators/elementwise_add_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void ElementwiseAddOp::InferShape() const { - auto x_dim = this->param_.InputX()->dims(); - this->param_.Out()->Resize(x_dim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); -#endif - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp); -#endif - -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp); -#endif - -#endif diff --git a/mobile/src/operators/elementwise_add_op.h b/mobile/src/operators/elementwise_add_op.h deleted file mode 100644 index 7819765813..0000000000 --- a/mobile/src/operators/elementwise_add_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEADD_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "kernel/elementwise_add_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class ElementwiseAddOp : public framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, - operators::ElementwiseAddKernel> { - public: - ElementwiseAddOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, - operators::ElementwiseAddKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/elementwise_mul_op.cpp b/mobile/src/operators/elementwise_mul_op.cpp deleted file mode 100644 index 48b2a4c282..0000000000 --- a/mobile/src/operators/elementwise_mul_op.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEMUL_OP - -#include "operators/elementwise_mul_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void ElementwiseMulOp::InferShape() const { - auto x_dim = this->param_.InputX()->dims(); - this->param_.Out()->Resize(x_dim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(elementwise_mul, ops::ElementwiseMulOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp); -#endif - -#endif diff --git a/mobile/src/operators/elementwise_mul_op.h b/mobile/src/operators/elementwise_mul_op.h deleted file mode 100644 index 53a90180b6..0000000000 --- a/mobile/src/operators/elementwise_mul_op.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEMUL_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "kernel/elementwise_mul_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class ElementwiseMulOp : public framework::OperatorWithKernel< - DeviceType, ElementwiseMulParam, - operators::ElementwiseMulKernel> { - public: - ElementwiseMulOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, ElementwiseMulParam, - operators::ElementwiseMulKernel>( - type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ElementwiseMulParam, - operators::ElementwiseMulKernel>::OperatorWithKernel; - void InferShape() const override; - - protected: -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/elementwise_sub_op.cpp b/mobile/src/operators/elementwise_sub_op.cpp deleted file mode 100644 index 6962e69a8d..0000000000 --- a/mobile/src/operators/elementwise_sub_op.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISESUB_OP - -#include "operators/elementwise_sub_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void ElementwiseSubOp::InferShape() const { - auto x_dim = this->param_.InputX()->dims(); - this->param_.Out()->Resize(x_dim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(elementwise_sub, ops::ElementwiseSubOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/elementwise_sub_op.h b/mobile/src/operators/elementwise_sub_op.h deleted file mode 100644 index ce3b310ef3..0000000000 --- a/mobile/src/operators/elementwise_sub_op.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISESUB_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "kernel/elementwise_sub_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class ElementwiseSubOp : public framework::OperatorWithKernel< - DeviceType, ElementwiseSubParam, - operators::ElementwiseSubKernel> { - public: - ElementwiseSubOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, ElementwiseSubParam, - operators::ElementwiseSubKernel>( - type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ElementwiseSubParam, - operators::ElementwiseSubKernel>::OperatorWithKernel; - void InferShape() const override; - - protected: -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/exp_op.cpp b/mobile/src/operators/exp_op.cpp deleted file mode 100644 index 549108d72e..0000000000 --- a/mobile/src/operators/exp_op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef EXP_OP -#include "exp_op.h" -namespace paddle_mobile { -namespace operators { - -template -void EXPOp::InferShape() const { - auto shape = this->param_.InputX()->dims(); - this->param_.Out()->Resize(shape); -} -} // namespace operators -} // namespace paddle_mobile -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(exp, ops::EXPOp); -#endif - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(exp, ops::EXPOp); -#endif - -#endif diff --git a/mobile/src/operators/exp_op.h b/mobile/src/operators/exp_op.h deleted file mode 100644 index 6f8cd099b7..0000000000 --- a/mobile/src/operators/exp_op.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/exp_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef EXP_OP -DECLARE_OPERATOR(EXP, EXPParam, EXPKernel); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/expand_op.cpp b/mobile/src/operators/expand_op.cpp deleted file mode 100644 index e1d8b76fd6..0000000000 --- a/mobile/src/operators/expand_op.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef EXPAND_OP - -#include "operators/expand_op.h" -#include - -namespace paddle_mobile { -namespace operators { - -template -void ExpandOp::InferShape() const { - auto x_dim = this->param_.InputX()->dims(); - - int expand_size = this->param_.expand_times.size(); - int x_dims_size = x_dim.size(); - PADDLE_MOBILE_ENFORCE(expand_size == x_dims_size, - "The number of expand_times size must be qual to the " - "rank of Input(X). The number of expand_times size " - "must be qual to the rank of Input(X).") - - framework::DDim out_dims(this->param_.InputX()->dims()); - for (size_t i = 0; i < this->param_.expand_times.size(); ++i) { - out_dims[i] *= this->param_.expand_times[i]; - } - this->param_.Out()->Resize(out_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(expand, ops::ExpandOp); -#endif - -#endif diff --git a/mobile/src/operators/expand_op.h b/mobile/src/operators/expand_op.h deleted file mode 100644 index d504000079..0000000000 --- a/mobile/src/operators/expand_op.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef EXPAND_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/expand_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef EXPAND_OP -DECLARE_OPERATOR(Expand, ExpandParam, ExpandKernel); -#endif - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/feed_op.cpp b/mobile/src/operators/feed_op.cpp deleted file mode 100644 index ffd253073a..0000000000 --- a/mobile/src/operators/feed_op.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/feed_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void FeedOp::InferShape() const { - auto out_dims = this->param_.Out()->dims(); - out_dims[0] = this->param_.BatchSize(); - int col = this->param_.Col(); - auto input_dims = this->param_.InputX()->at(col).dims(); - this->param_.Out()->Resize(input_dims); - if (input_dims.size() == 4 || input_dims.size() == 2) { - this->param_.Out()->Resize(input_dims); - } else { - this->param_.Out()->Resize(out_dims); - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(feed, ops::FeedOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(feed, ops::FeedOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(feed, ops::FeedOp); -#endif diff --git a/mobile/src/operators/feed_op.h b/mobile/src/operators/feed_op.h deleted file mode 100644 index fda259b585..0000000000 --- a/mobile/src/operators/feed_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/feed_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using std::string; - -template -class FeedOp - : public framework::OperatorWithKernel, - FeedKernel> { - public: - FeedOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, - framework::Scope *scope) - - : framework::OperatorWithKernel, - FeedKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/fetch_op.cpp b/mobile/src/operators/fetch_op.cpp deleted file mode 100644 index 104e8214a0..0000000000 --- a/mobile/src/operators/fetch_op.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/fetch_op.h" -namespace paddle_mobile { -namespace operators { - -template -void FetchOp::InferShape() const { - int col = this->param_.Col(); - auto x_dims = this->param_.InputX()->dims(); - this->param_.Out()->at(col).Resize(x_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); -#endif - -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fetch, ops::FetchOp); -#endif diff --git a/mobile/src/operators/fetch_op.h b/mobile/src/operators/fetch_op.h deleted file mode 100644 index 72c8e1997f..0000000000 --- a/mobile/src/operators/fetch_op.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/fetch_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; - -template -class FetchOp - : public framework::OperatorWithKernel, - FetchKernel> { - public: - FetchOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - FetchKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/fill_constant_batch_size_like_op.cpp b/mobile/src/operators/fill_constant_batch_size_like_op.cpp deleted file mode 100644 index 848ab436f2..0000000000 --- a/mobile/src/operators/fill_constant_batch_size_like_op.cpp +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FILL_CONSTANT_BATCH_SIZE_LIKE_OP - -#include "operators/fill_constant_batch_size_like_op.h" - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOp); -#endif - -#endif diff --git a/mobile/src/operators/fill_constant_batch_size_like_op.h b/mobile/src/operators/fill_constant_batch_size_like_op.h deleted file mode 100644 index dff76d85d1..0000000000 --- a/mobile/src/operators/fill_constant_batch_size_like_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FILL_CONSTANT_BATCH_SIZE_LIKE_OP - -#pragma once - -#include -#include -#include "framework/data_type.h" -#include "framework/operator.h" -#include "framework/selected_rows.h" -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class FillConstantBatchSizeLikeOp : public framework::OperatorBase { - public: - FillConstantBatchSizeLikeOp(const std::string &type, - const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap attrs, - framework::Scope *scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, scope) {} - void RunImpl() { - auto data_type = - static_cast<_PaddleMobile__Framework__Proto__VarType__Type>( - param_.DataDtype()); - framework::Tensor *tensor = nullptr; - auto value = param_.Value(); - auto *outvar = param_.OutVar(); - - if (outvar->template IsType()) { - tensor = outvar->template GetMutable(); - } else if (outvar->template IsType()) { - tensor = outvar->template GetMutable() - ->mutable_value(); - } else { - PADDLE_MOBILE_THROW_EXCEPTION( - "fill constant batch size like op's output only" - "supports SelectedRows and LoDTensor"); - } - auto shape = param_.Shape(); - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), shape_int64.begin(), - [](int a) { return static_cast(a); }); - auto ddim = framework::make_ddim(shape_int64); - ddim[param_.OutputDimIdx()] = param_.Input()->dims()[param_.InputDimIdx()]; - tensor->Resize(ddim); - tensor->mutable_data(framework::ToTypeIndex(data_type)); - - math::SetConstant(tensor, value); - } - - void Init() {} - - void InferShape() const { - PADDLE_MOBILE_ENFORCE( - param_.Out() != nullptr, - "Output (Out) of fill_constant_batch_size_like op should not be null."); - - auto shape = param_.Shape(); - - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), 
shape_int64.begin(), - [](int a) { return static_cast(a); }); - DLOG << shape_int64; - auto ddim = framework::make_ddim(shape_int64); - ddim[param_.OutputDimIdx()] = param_.Input()->dims()[param_.InputDimIdx()]; - param_.Out()->Resize(ddim); - } - - protected: - FillConstantBatchSizeLikeParam param_; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fill_constant_op.cpp b/mobile/src/operators/fill_constant_op.cpp deleted file mode 100644 index 0c13c57ceb..0000000000 --- a/mobile/src/operators/fill_constant_op.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FILL_CONSTANT_OP - -#include "operators/fill_constant_op.h" - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp); -#endif - -#endif diff --git a/mobile/src/operators/fill_constant_op.h b/mobile/src/operators/fill_constant_op.h deleted file mode 100644 index 0a51f8494d..0000000000 --- a/mobile/src/operators/fill_constant_op.h +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FILL_CONSTANT_OP - -#pragma once - -#include -#include "framework/data_type.h" -#include "framework/operator.h" -#include "framework/selected_rows.h" -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class FillConstantOp : public framework::OperatorBase { - public: - FillConstantOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap attrs, framework::Scope *scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, scope) {} - void RunImpl() { - auto data_type = - static_cast<_PaddleMobile__Framework__Proto__VarType__Type>( - param_.DataDtype()); - framework::Tensor *tensor = nullptr; - auto value = param_.Value(); - auto *outvar = param_.OutVar(); - - if (outvar->template IsType()) { - tensor = outvar->template GetMutable(); - } else if (outvar->template IsType()) { - tensor = outvar->template GetMutable() - ->mutable_value(); - } else { - PADDLE_MOBILE_THROW_EXCEPTION( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - tensor->Resize(framework::make_ddim(param_.Shape())); - tensor->mutable_data(framework::ToTypeIndex(data_type)); - - math::SetConstant(tensor, value); - } - - void Init() {} - - void InferShape() const { - PADDLE_MOBILE_ENFORCE( - param_.Out() != nullptr, - "Output (Out) of fill_constant op should not be null."); - framework::DDim ddim = framework::make_ddim(param_.Shape()); - param_.Out()->Resize(ddim); 
- } - - protected: - FillConstantParam param_; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/flatten2_op.cpp b/mobile/src/operators/flatten2_op.cpp deleted file mode 100644 index 78e933e278..0000000000 --- a/mobile/src/operators/flatten2_op.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FLATTEN2_OP -#include "operators/flatten2_op.h" -#include - -namespace paddle_mobile { -namespace operators { -template -void Flatten2Op::InferShape() const { - const auto* input = this->param_.InputX(); - auto* output = this->param_.Out(); - auto input_x_dims = input->dims(); - if (input->dims().size() == 4) { - PADDLE_MOBILE_ENFORCE(this->param_.Axis() == 1, - "flatten 2 only support axis == 1"); - if (this->param_.Axis() == 1) { - std::vector temp_output_dims(2); - temp_output_dims[0] = input->dims()[0]; - temp_output_dims[1] = - input->dims()[1] * input->dims()[2] * input->dims()[3]; - output->Resize(framework::make_ddim(temp_output_dims)); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(flatten2, ops::Flatten2Op); -#endif - -#endif diff --git a/mobile/src/operators/flatten2_op.h b/mobile/src/operators/flatten2_op.h deleted file mode 100644 index 9c08e9c335..0000000000 --- a/mobile/src/operators/flatten2_op.h +++ 
/dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FLATTEN2_OP - -#pragma once - -#include -#include - -#include "framework/operator.h" -#include "operators/kernel/flatten2_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(Flatten2, FlattenParam, Flatten2Kernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/flatten_op.cpp b/mobile/src/operators/flatten_op.cpp deleted file mode 100644 index 4e52485345..0000000000 --- a/mobile/src/operators/flatten_op.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FLATTEN_OP - -#include "operators/flatten_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void FlattenOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, - "Input (X) of Flatten op should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, - "Output (Output) of Flatten op should not be null."); - - auto &axis = this->param_.Axis(); - PADDLE_MOBILE_ENFORCE(axis >= 0, - "The axis should be greater than or equal to 0."); - - auto &in_dims = this->param_.InputX()->dims(); - PADDLE_MOBILE_ENFORCE( - axis <= in_dims.size(), - "The axis should be less than or equal to input tensor's rank."); - - const auto &out_dims = GetOutputShape(axis, in_dims); - this->param_.Out()->Resize(framework::make_ddim(out_dims)); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(flatten, ops::FlattenOp); -REGISTER_OPERATOR_CPU(flatten2, ops::Flatten2Op); -#endif - -#endif // FLATTEN_OP diff --git a/mobile/src/operators/flatten_op.h b/mobile/src/operators/flatten_op.h deleted file mode 100644 index ef97994dc1..0000000000 --- a/mobile/src/operators/flatten_op.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FLATTEN_OP - -#pragma once - -#include -#include - -#include "framework/operator.h" -#include "operators/kernel/flatten_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -inline std::vector GetOutputShape(const int axis, - const framework::DDim &in_dims) { - int64_t outer = 1, inner = 1; - for (int i = 0; i < in_dims.size(); ++i) { - if (i < axis) { - outer *= in_dims[i]; - } else { - inner *= in_dims[i]; - } - } - std::vector out_shape(2); - out_shape[0] = static_cast(outer); - out_shape[1] = static_cast(inner); - return out_shape; -} - -template -class FlattenOp : public framework::OperatorWithKernel< - DeviceType, FlattenParam, - operators::FlattenKernel> { - public: - FlattenOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::FlattenKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -template -class Flatten2Op : public FlattenOp { - public: - Flatten2Op(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : FlattenOp(type, inputs, outputs, attrs, scope) {} -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_add_bn_op.cpp b/mobile/src/operators/fusion_conv_add_bn_op.cpp deleted file mode 100644 index 27e3c04d62..0000000000 --- a/mobile/src/operators/fusion_conv_add_bn_op.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBN_OP - -#include "operators/fusion_conv_add_bn_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvAddBNOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_add_bn, ops::FusionConvAddBNMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_conv_add_bn_op.h b/mobile/src/operators/fusion_conv_add_bn_op.h deleted file mode 100644 index 
0618f80512..0000000000 --- a/mobile/src/operators/fusion_conv_add_bn_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBN_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_add_bn_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionConvAddBNMatcher : public framework::FusionOpMatcher { - public: - FusionConvAddBNMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN; } -}; - -template -class FusionConvAddBNOp : public framework::OperatorWithKernel< - DeviceType, FusionConvAddBNParam, - operators::ConvAddBNKernel> { - public: - FusionConvAddBNOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope 
*scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvAddBNParam, - operators::ConvAddBNKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_add_bn_relu_op.cpp b/mobile/src/operators/fusion_conv_add_bn_relu_op.cpp deleted file mode 100644 index 4cf7e70112..0000000000 --- a/mobile/src/operators/fusion_conv_add_bn_relu_op.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADDBNRELU_OP - -#include "operators/fusion_conv_add_bn_relu_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvAddBNReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_add_bn_relu, - ops::FusionConvAddBNReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); -#endif -#endif diff --git a/mobile/src/operators/fusion_conv_add_bn_relu_op.h b/mobile/src/operators/fusion_conv_add_bn_relu_op.h deleted file mode 100644 index 9dd2fd406a..0000000000 --- a/mobile/src/operators/fusion_conv_add_bn_relu_op.h +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBNRELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_add_bn_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher { - public: - FusionConvAddBNReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; } -}; - -template -class FusionConvAddBNReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, - operators::ConvAddBNReluKernel> { - public: - FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, - 
operators::ConvAddBNReluKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_add_op.cpp b/mobile/src/operators/fusion_conv_add_op.cpp deleted file mode 100644 index c611f1084f..0000000000 --- a/mobile/src/operators/fusion_conv_add_op.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP - -#include "operators/fusion_conv_add_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvAddOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_add, ops::FusionConvAddMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); -#endif - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_conv_add, ops::FusionConvAddOp); -#endif -#endif diff --git a/mobile/src/operators/fusion_conv_add_op.h b/mobile/src/operators/fusion_conv_add_op.h deleted file mode 100644 index 22ecab45e6..0000000000 --- a/mobile/src/operators/fusion_conv_add_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADD_OP -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_add_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionConvAddMatcher : public framework::FusionOpMatcher { - public: - FusionConvAddMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD; } -}; - -template -class FusionConvAddOp : public framework::OperatorWithKernel< - DeviceType, FusionConvAddParam, - operators::ConvAddKernel> { - public: - FusionConvAddOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ConvAddKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_add_relu_op.cpp b/mobile/src/operators/fusion_conv_add_relu_op.cpp deleted 
file mode 100644 index d827d845e1..0000000000 --- a/mobile/src/operators/fusion_conv_add_relu_op.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDRELU_OP - -#include "operators/fusion_conv_add_relu_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvAddReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_add_relu, ops::FusionConvAddReluOpMatcher); - -#ifdef 
PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp); -#endif -#endif diff --git a/mobile/src/operators/fusion_conv_add_relu_op.h b/mobile/src/operators/fusion_conv_add_relu_op.h deleted file mode 100644 index 7a1cfd1941..0000000000 --- a/mobile/src/operators/fusion_conv_add_relu_op.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADDRELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_add_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher { - public: - FusionConvAddReluOpMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); - } - std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; } -}; - -template -class FusionConvAddReluOp : public framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, - operators::ConvAddReluKernel> { - public: - FusionConvAddReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, - operators::ConvAddReluKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_bn_add_relu_op.cpp b/mobile/src/operators/fusion_conv_bn_add_relu_op.cpp deleted file mode 100644 index 759c0df8d4..0000000000 --- a/mobile/src/operators/fusion_conv_bn_add_relu_op.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBNADDRELU_OP - -#include "operators/fusion_conv_bn_add_relu_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvBNAddReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu, - ops::FusionConvBNAddReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) 
-REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_conv_bn_add_relu_op.h b/mobile/src/operators/fusion_conv_bn_add_relu_op.h deleted file mode 100644 index 676d30ce26..0000000000 --- a/mobile/src/operators/fusion_conv_bn_add_relu_op.h +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBNADDRELU_OP - -#pragma once - -#include -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_bn_add_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionConvBNAddReluMatcher : public framework::FusionOpMatcher { - public: - FusionConvBNAddReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}, - {"Y", "BNY"}}}}, - removed_nodes); - } - - std::string Type() { return 
G_OP_TYPE_FUSION_CONV_BN_ADD_RELU; } - std::vector> NeedCheck() { - DLOG << " conv bn add relu check add X "; - return {{2, "Y"}, {2, "X"}}; - } -}; - -template -class FusionConvBNAddReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionConvBNAddReluParam, - operators::ConvBNAddReluKernel> { - public: - FusionConvBNAddReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvBNAddReluParam, - operators::ConvBNAddReluKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_bn_op.cpp b/mobile/src/operators/fusion_conv_bn_op.cpp deleted file mode 100644 index 3c6fa5b1a3..0000000000 --- a/mobile/src/operators/fusion_conv_bn_op.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBN_OP - -#include "operators/fusion_conv_bn_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvBNOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_bn, ops::FusionConvBNMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_conv_bn_op.h b/mobile/src/operators/fusion_conv_bn_op.h deleted file mode 100644 index 385bb539fd..0000000000 --- a/mobile/src/operators/fusion_conv_bn_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBN_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_bn_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionConvBNMatcher : public framework::FusionOpMatcher { - public: - FusionConvBNMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_BATCHNORM); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_CONV_BN; } -}; - -template -class FusionConvBNOp : public framework::OperatorWithKernel< - DeviceType, FusionConvBNParam, - operators::ConvBNKernel> { - public: - FusionConvBNOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ConvBNKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_bn_relu_op.cpp b/mobile/src/operators/fusion_conv_bn_relu_op.cpp deleted file mode 100644 index 4561ec7b93..0000000000 --- 
a/mobile/src/operators/fusion_conv_bn_relu_op.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBNRELU_OP - -#include "operators/fusion_conv_bn_relu_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvBNReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_bn_relu, ops::FusionConvBNReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, 
ops::FusionConvBNReluOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_conv_bn_relu, ops::FusionConvBNReluOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_conv_bn_relu_op.h b/mobile/src/operators/fusion_conv_bn_relu_op.h deleted file mode 100644 index 2f49df081c..0000000000 --- a/mobile/src/operators/fusion_conv_bn_relu_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBNRELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_bn_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionConvBNReluMatcher : public framework::FusionOpMatcher { - public: - FusionConvBNReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; } -}; - -template -class FusionConvBNReluOp : public framework::OperatorWithKernel< - DeviceType, FusionConvBNReluParam, - operators::ConvBNReluKernel> { - public: - FusionConvBNReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvBNReluParam, - operators::ConvBNReluKernel>(type, inputs, outputs, - attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_conv_relu_op.cpp b/mobile/src/operators/fusion_conv_relu_op.cpp deleted file mode 100644 index d403ceae2f..0000000000 --- a/mobile/src/operators/fusion_conv_relu_op.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVRELU_OP - -#include "operators/fusion_conv_relu_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_relu, ops::FusionConvReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_relu, ops::FusionConvReluOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_conv_relu, ops::FusionConvReluOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_conv_relu, ops::FusionConvReluOp); -#endif - -#endif diff --git 
a/mobile/src/operators/fusion_conv_relu_op.h b/mobile/src/operators/fusion_conv_relu_op.h deleted file mode 100644 index 6444b6b739..0000000000 --- a/mobile/src/operators/fusion_conv_relu_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVRELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionConvReluMatcher : public framework::FusionOpMatcher { - public: - FusionConvReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), {}, removed_nodes); - } - std::string Type() { return G_OP_TYPE_FUSION_CONV_RELU; } -}; - -template -class FusionConvReluOp : public framework::OperatorWithKernel< - DeviceType, FusionConvReluParam, - operators::ConvReluKernel> { - public: - FusionConvReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ConvReluKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; 
- - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_deconv_add_bn_op.cpp b/mobile/src/operators/fusion_deconv_add_bn_op.cpp deleted file mode 100644 index e83e29d2ea..0000000000 --- a/mobile/src/operators/fusion_deconv_add_bn_op.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DECONVADDBN_OP - -#include "operators/fusion_deconv_add_bn_op.h" - -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher); -#ifdef PADDLE_MOBILE_CPU -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_deconv_add_bn_op.h b/mobile/src/operators/fusion_deconv_add_bn_op.h deleted file mode 100644 index 6185450441..0000000000 --- a/mobile/src/operators/fusion_deconv_add_bn_op.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef FUSION_DECONVADDBN_OP -#pragma once -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/deconv_add_bn_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionDeconvAddBNMatcher : public framework::FusionOpMatcher { - public: - FusionDeconvAddBNMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}, - {"Y", "BNY"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN; } -}; - -template -class FusionDeconvAddBNOp : public framework::OperatorWithKernel< - DeviceType, FusionDeconvAddBNParam, - operators::DeconvAddBNKernel> { - public: - FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDeconvAddBNParam, - operators::DeconvAddBNKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const { - auto input = this->param_.Input(); - auto in_dims = input->dims(); - - auto filter = 
this->param_.Filter(); - auto filter_dims = filter->dims(); - - std::vector strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector dilations = this->param_.Dilations(); - - int groups = this->param_.Groups(); - - PADDLE_MOBILE_ENFORCE( - in_dims.size() == 4 || in_dims.size() == 5, - "ConvTransposeOp intput should be 4-D or 5-D tensor."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() == filter_dims.size(), - "ConvTransposeOp input dimension and filter dimension " - "should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() - strides.size() == 2U, - "ConvTransposeOp input dimension and strides dimension should " - "be consistent."); - PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), - "ConvTransposeOp paddings dimension and strides " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), - "ConvTransposeOp paddings dimension and dilations " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims[1] == filter_dims[0], - "In ConvTransposeOp, The number of input channels should " - "be equal to the number of filter's channels."); - - std::vector output_shape({in_dims[0], filter_dims[1] * groups}); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - - 2 * paddings[i] + filter_extent); - } - this->param_.Output()->Resize(framework::make_ddim(output_shape)); - } - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_DECONV_ADD_BN_OP diff --git a/mobile/src/operators/fusion_deconv_add_bn_relu_op.cpp b/mobile/src/operators/fusion_deconv_add_bn_relu_op.cpp deleted file mode 100755 index 9f3ca09c3e..0000000000 --- a/mobile/src/operators/fusion_deconv_add_bn_relu_op.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DECONVADDBNRELU_OP - -#include "operators/fusion_deconv_add_bn_relu_op.h" - -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu, - ops::FusionDeconvAddBNReluMatcher); -#ifdef PADDLE_MOBILE_CPU -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_deconv_add_bn_relu_op.h b/mobile/src/operators/fusion_deconv_add_bn_relu_op.h deleted file mode 100644 index 1c6cfd7318..0000000000 --- a/mobile/src/operators/fusion_deconv_add_bn_relu_op.h +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_DECONVADDBNRELU_OP -#pragma once -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/deconv_add_bn_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionDeconvAddBNReluMatcher : public framework::FusionOpMatcher { - public: - FusionDeconvAddBNReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}, - {"Y", "BNY"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; } -}; - -template -class FusionDeconvAddBNReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionDeconvAddBNReluParam, - operators::DeconvAddBNReluKernel> { - public: - FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDeconvAddBNReluParam, - operators::DeconvAddBNReluKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const { - auto input = this->param_.Input(); - auto in_dims = input->dims(); - - auto filter = this->param_.Filter(); - auto filter_dims = filter->dims(); - - std::vector strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector dilations = this->param_.Dilations(); - - int groups = this->param_.Groups(); - - PADDLE_MOBILE_ENFORCE( - in_dims.size() == 4 || in_dims.size() == 5, - 
"ConvTransposeOp intput should be 4-D or 5-D tensor."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() == filter_dims.size(), - "ConvTransposeOp input dimension and filter dimension " - "should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() - strides.size() == 2U, - "ConvTransposeOp input dimension and strides dimension should " - "be consistent."); - PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), - "ConvTransposeOp paddings dimension and strides " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), - "ConvTransposeOp paddings dimension and dilations " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims[1] == filter_dims[0], - "In ConvTransposeOp, The number of input channels should " - "be equal to the number of filter's channels."); - - std::vector output_shape({in_dims[0], filter_dims[1] * groups}); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - - 2 * paddings[i] + filter_extent); - } - this->param_.Output()->Resize(framework::make_ddim(output_shape)); - } - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_DECONV_ADD_BN_RELU_OP diff --git a/mobile/src/operators/fusion_deconv_add_op.cpp b/mobile/src/operators/fusion_deconv_add_op.cpp deleted file mode 100644 index 717039cd3d..0000000000 --- a/mobile/src/operators/fusion_deconv_add_op.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DECONVADD_OP - -#include "operators/fusion_deconv_add_op.h" - -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_deconv_add, ops::FusionDeconvAddMatcher); -#ifdef PADDLE_MOBILE_CPU -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_deconv_add, ops::FusionDeconvAddOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_deconv_add_op.h b/mobile/src/operators/fusion_deconv_add_op.h deleted file mode 100644 index 406f81318a..0000000000 --- a/mobile/src/operators/fusion_deconv_add_op.h +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_DECONVADD_OP -#pragma once -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/deconv_add_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionDeconvAddMatcher : public framework::FusionOpMatcher { - public: - FusionDeconvAddMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD; } -}; - -template -class FusionDeconvAddOp : public framework::OperatorWithKernel< - DeviceType, FusionDeconvAddParam, - operators::DeconvAddKernel> { - public: - FusionDeconvAddOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDeconvAddParam, - operators::DeconvAddKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const { - auto input = this->param_.Input(); - auto in_dims = input->dims(); - - auto filter = this->param_.Filter(); - auto filter_dims = filter->dims(); - - std::vector strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector dilations = this->param_.Dilations(); - - int groups = this->param_.Groups(); - - PADDLE_MOBILE_ENFORCE( - in_dims.size() == 4 || in_dims.size() == 5, - "ConvTransposeOp intput should be 4-D or 5-D tensor."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() == filter_dims.size(), - "ConvTransposeOp input dimension and filter dimension " - "should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() - strides.size() == 2U, - "ConvTransposeOp input dimension 
and strides dimension should " - "be consistent."); - PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), - "ConvTransposeOp paddings dimension and strides " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), - "ConvTransposeOp paddings dimension and dilations " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims[1] == filter_dims[0], - "In ConvTransposeOp, The number of input channels should " - "be equal to the number of filter's channels."); - - std::vector output_shape({in_dims[0], filter_dims[1] * groups}); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - - 2 * paddings[i] + filter_extent); - } - this->param_.Output()->Resize(framework::make_ddim(output_shape)); - } - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_DECONV_ADD_OP diff --git a/mobile/src/operators/fusion_deconv_add_relu_op.cpp b/mobile/src/operators/fusion_deconv_add_relu_op.cpp deleted file mode 100644 index a461bce2ef..0000000000 --- a/mobile/src/operators/fusion_deconv_add_relu_op.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDRELU_OP - -#include "operators/fusion_deconv_add_relu_op.h" - -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_deconv_add_relu, - ops::FusionDeconvAddReluMatcher); -#ifdef PADDLE_MOBILE_CPU -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_deconv_add_relu, ops::FusionDeconvAddReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_deconv_add_relu_op.h b/mobile/src/operators/fusion_deconv_add_relu_op.h deleted file mode 100644 index 735e126b03..0000000000 --- a/mobile/src/operators/fusion_deconv_add_relu_op.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_DECONVADDRELU_OP -#pragma once -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/deconv_add_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionDeconvAddReluMatcher : public framework::FusionOpMatcher { - public: - FusionDeconvAddReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_RELU; } -}; - -template -class FusionDeconvAddReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionDeconvAddReluParam, - operators::DeconvAddReluKernel> { - public: - FusionDeconvAddReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDeconvAddReluParam, - operators::DeconvAddReluKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const { - auto input = this->param_.Input(); - auto in_dims = input->dims(); - - auto filter = this->param_.Filter(); - auto filter_dims = filter->dims(); - - std::vector strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector dilations = this->param_.Dilations(); - - int groups = this->param_.Groups(); - - PADDLE_MOBILE_ENFORCE( - in_dims.size() == 4 || in_dims.size() == 5, - "ConvTransposeOp intput should be 4-D or 5-D tensor."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() == filter_dims.size(), - "ConvTransposeOp input dimension and filter dimension " - "should be the same."); - 
PADDLE_MOBILE_ENFORCE( - in_dims.size() - strides.size() == 2U, - "ConvTransposeOp input dimension and strides dimension should " - "be consistent."); - PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), - "ConvTransposeOp paddings dimension and strides " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), - "ConvTransposeOp paddings dimension and dilations " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims[1] == filter_dims[0], - "In ConvTransposeOp, The number of input channels should " - "be equal to the number of filter's channels."); - - std::vector output_shape({in_dims[0], filter_dims[1] * groups}); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - - 2 * paddings[i] + filter_extent); - } - this->param_.Output()->Resize(framework::make_ddim(output_shape)); - } - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_DECONV_ADD_RELU_OP diff --git a/mobile/src/operators/fusion_deconv_bn_relu_op.cpp b/mobile/src/operators/fusion_deconv_bn_relu_op.cpp deleted file mode 100644 index 207acd9380..0000000000 --- a/mobile/src/operators/fusion_deconv_bn_relu_op.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVBNRELU_OP - -#include "operators/fusion_deconv_bn_relu_op.h" - -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_deconv_bn_relu, ops::FusionDeconvBNReluMatcher); -#ifdef PADDLE_MOBILE_CPU -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_deconv_bn_relu, ops::FusionDeconvBNReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_deconv_bn_relu_op.h b/mobile/src/operators/fusion_deconv_bn_relu_op.h deleted file mode 100644 index 92bb97445d..0000000000 --- a/mobile/src/operators/fusion_deconv_bn_relu_op.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_DECONVBNRELU_OP -#pragma once -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/deconv_bn_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionDeconvBNReluMatcher : public framework::FusionOpMatcher { - public: - FusionDeconvBNReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); - node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DECONV_BN_RELU; } -}; - -template -class FusionDeconvBNReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionDeconvBNReluParam, - operators::DeconvBNReluKernel> { - public: - FusionDeconvBNReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDeconvBNReluParam, - operators::DeconvBNReluKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const { - auto input = this->param_.Input(); - auto in_dims = input->dims(); - - auto filter = this->param_.Filter(); - auto filter_dims = filter->dims(); - - std::vector strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector dilations = this->param_.Dilations(); - - int groups = this->param_.Groups(); - - PADDLE_MOBILE_ENFORCE( - in_dims.size() == 4 || in_dims.size() == 5, - "ConvTransposeOp intput should be 4-D or 5-D tensor."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() == filter_dims.size(), - "ConvTransposeOp input dimension and 
filter dimension " - "should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() - strides.size() == 2U, - "ConvTransposeOp input dimension and strides dimension should " - "be consistent."); - PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), - "ConvTransposeOp paddings dimension and strides " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), - "ConvTransposeOp paddings dimension and dilations " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims[1] == filter_dims[0], - "In ConvTransposeOp, The number of input channels should " - "be equal to the number of filter's channels."); - - std::vector output_shape({in_dims[0], filter_dims[1] * groups}); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - - 2 * paddings[i] + filter_extent); - } - this->param_.Output()->Resize(framework::make_ddim(output_shape)); - } - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_DECONV_BN_RELU_OP diff --git a/mobile/src/operators/fusion_deconv_relu_op.cpp b/mobile/src/operators/fusion_deconv_relu_op.cpp deleted file mode 100644 index 7c48c4f14c..0000000000 --- a/mobile/src/operators/fusion_deconv_relu_op.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVRELU_OP - -#include "operators/fusion_deconv_relu_op.h" - -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -#endif - -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_deconv_relu_op.h b/mobile/src/operators/fusion_deconv_relu_op.h deleted file mode 100644 index c290a8da08..0000000000 --- a/mobile/src/operators/fusion_deconv_relu_op.h +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_DECONVRELU_OP -#pragma once -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/deconv_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionDeconvReluMatcher : public framework::FusionOpMatcher { - public: - FusionDeconvReluMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); - node_ > std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), {}, removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; } -}; - -template -class FusionDeconvReluOp : public framework::OperatorWithKernel< - DeviceType, FusionDeconvReluParam, - operators::DeconvReluKernel> { - public: - FusionDeconvReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDeconvReluParam, - operators::DeconvReluKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const { - auto input = this->param_.Input(); - auto in_dims = input->dims(); - - auto filter = this->param_.Filter(); - auto filter_dims = filter->dims(); - - std::vector strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector dilations = this->param_.Dilations(); - - int groups = this->param_.Groups(); - - PADDLE_MOBILE_ENFORCE( - in_dims.size() == 4 || in_dims.size() == 5, - "ConvTransposeOp intput should be 4-D or 5-D tensor."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() == filter_dims.size(), - "ConvTransposeOp input dimension and filter dimension " - "should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims.size() - strides.size() == 2U, - "ConvTransposeOp input dimension and strides dimension should " - "be 
consistent."); - PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), - "ConvTransposeOp paddings dimension and strides " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), - "ConvTransposeOp paddings dimension and dilations " - "dimension should be the same."); - PADDLE_MOBILE_ENFORCE( - in_dims[1] == filter_dims[0], - "In ConvTransposeOp, The number of input channels should " - "be equal to the number of filter's channels."); - - std::vector output_shape({in_dims[0], filter_dims[1] * groups}); - for (size_t i = 0; i < strides.size(); ++i) { - auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - - 2 * paddings[i] + filter_extent); - } - this->param_.Output()->Resize(framework::make_ddim(output_shape)); - } - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_FC_RELU_OP diff --git a/mobile/src/operators/fusion_dequant_add_bn_op.cpp b/mobile/src/operators/fusion_dequant_add_bn_op.cpp deleted file mode 100644 index 4df50af22b..0000000000 --- a/mobile/src/operators/fusion_dequant_add_bn_op.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DEQUANT_ADD_BN_OP - -#include "operators/fusion_dequant_add_bn_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionDequantAddBNOp::InferShape() const { - const auto& input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_dequant_add_bn, ops::FusionDequantAddBNMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_dequant_add_bn, ops::FusionDequantAddBNOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_dequant_add_bn_op.h b/mobile/src/operators/fusion_dequant_add_bn_op.h deleted file mode 100644 index b838b544ce..0000000000 --- a/mobile/src/operators/fusion_dequant_add_bn_op.h +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DEQUANT_ADD_BN_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/dequant_bn_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionDequantAddBNMatcher : public framework::FusionOpMatcher { - public: - FusionDequantAddBNMatcher() { - node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "BNScale"}, - {"Mean", "BNMean"}, - {"Bias", "BNBias"}, - {"Variance", "BNVariance"}, - {"Y", "Out"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN; } -}; - -template -class FusionDequantAddBNOp - : public framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNParam, - operators::FusionDequantAddBNKernel> { - public: - FusionDequantAddBNOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNParam, - operators::FusionDequantAddBNKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_dequant_add_bn_relu_op.cpp b/mobile/src/operators/fusion_dequant_add_bn_relu_op.cpp deleted file mode 100644 index 80d9040afb..0000000000 --- a/mobile/src/operators/fusion_dequant_add_bn_relu_op.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP - -#include "operators/fusion_dequant_add_bn_relu_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionDequantAddBNReluOp::InferShape() const { - const auto& input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_relu, - ops::FusionDequantAddBNReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_relu, - ops::FusionDequantAddBNReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_dequant_add_bn_relu_op.h b/mobile/src/operators/fusion_dequant_add_bn_relu_op.h deleted file mode 100644 index e2762923c5..0000000000 --- a/mobile/src/operators/fusion_dequant_add_bn_relu_op.h +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/dequant_bn_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher { - public: - FusionDequantAddBNReluMatcher() { - node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "BNScale"}, - {"Mean", "BNMean"}, - {"Bias", "BNBias"}, - {"Variance", "BNVariance"}, - {"Y", "Out"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU; } -}; - -template -class FusionDequantAddBNReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNParam, - operators::FusionDequantAddBNReluKernel> { - public: - FusionDequantAddBNReluOp(const std::string &type, - const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNParam, - operators::FusionDequantAddBNReluKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.cpp b/mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.cpp deleted file mode 100644 index 82eacd7f47..0000000000 --- 
a/mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/fusion_dequant_add_bn_relu_quant_op.h" - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP -namespace paddle_mobile { -namespace operators { - -template -void FusionDequantAddBNReluQuantOp::InferShape() const { - const auto& input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant, - ops::FusionDequantAddBNReluQuantMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_relu_quant, - ops::FusionDequantAddBNReluQuantOp); -#endif -#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP - -#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP -namespace paddle_mobile { -namespace operators { - -template -void FusionDequantAddBNQuantOp::InferShape() const { - const auto& input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_quant, - ops::FusionDequantAddBNQuantMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_quant, - ops::FusionDequantAddBNQuantOp); -#endif - 
-#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP diff --git a/mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.h b/mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.h deleted file mode 100644 index 6caa8daeb3..0000000000 --- a/mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/dequant_bn_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP -class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher { - public: - FusionDequantAddBNReluQuantMatcher() { - node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU) > - std::make_shared(G_OP_TYPE_QUANTIZE); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "BNScale"}, - {"Mean", "BNMean"}, - {"Bias", "BNBias"}, - {"Variance", "BNVariance"}, - {"Y", "Out"}}}}, - removed_nodes); - } - - std::string Type() { return 
G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT; } -}; - -template -class FusionDequantAddBNReluQuantOp - : public framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNReluQuantParam, - operators::FusionDequantAddBNReluQuantKernel> { - public: - FusionDequantAddBNReluQuantOp(const std::string &type, - const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNReluQuantParam, - operators::FusionDequantAddBNReluQuantKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; -#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP - -#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP -class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher { - public: - FusionDequantAddBNQuantMatcher() { - node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_QUANTIZE); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_BATCHNORM, - {{"Scale", "BNScale"}, - {"Mean", "BNMean"}, - {"Bias", "BNBias"}, - {"Variance", "BNVariance"}, - {"Y", "Out"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT; } -}; - -template -class FusionDequantAddBNQuantOp - : public framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNQuantParam, - operators::FusionDequantAddBNQuantKernel> { - public: - FusionDequantAddBNQuantOp(const std::string &type, - const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDequantAddBNQuantParam, - operators::FusionDequantAddBNQuantKernel>( - type, 
inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; -#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/fusion_dequant_bn_op.cpp b/mobile/src/operators/fusion_dequant_bn_op.cpp deleted file mode 100644 index 3c944c0158..0000000000 --- a/mobile/src/operators/fusion_dequant_bn_op.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/fusion_dequant_bn_op.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef FUSION_DEQUANT_BN_OP -template -void FusionDequantBNOp::InferShape() const { - const auto& input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); -} -#endif // FUSION_DEQUANT_BN_OP - -#ifdef FUSION_DEQUANT_BN_RELU_OP -template -void FusionDequantBNReluOp::InferShape() const { - const auto& input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); -} -#endif // FUSION_DEQUANT_BN_RELU_OP - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef FUSION_DEQUANT_BN_OP -REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher); -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp); -#endif // PADDLE_MOBILE_CPU -#endif // FUSION_DEQUANT_BN_OP - -#ifdef FUSION_DEQUANT_BN_RELU_OP -REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu, - ops::FusionDequantBNReluMatcher); -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp); -#endif // PADDLE_MOBILE_CPU -#endif // FUSION_DEQUANT_BN_RELU_OP diff --git a/mobile/src/operators/fusion_dequant_bn_op.h b/mobile/src/operators/fusion_dequant_bn_op.h deleted file mode 100644 index ac2237b77a..0000000000 --- a/mobile/src/operators/fusion_dequant_bn_op.h +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/dequant_bn_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP) -class FusionDequantBNMatcher : public framework::FusionOpMatcher { - public: - FusionDequantBNMatcher() { - node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); - node_ > std::make_shared(G_OP_TYPE_BATCHNORM); - } - - virtual void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_BATCHNORM, - {{"Scale", "BNScale"}, - {"Mean", "BNMean"}, - {"Bias", "BNBias"}, - {"Variance", "BNVariance"}, - {"Y", "Out"}}}}, - removed_nodes); - } - - std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; } -}; -#endif // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP - -#ifdef FUSION_DEQUANT_BN_OP -template -class FusionDequantBNOp : public framework::OperatorWithKernel< - DeviceType, FusionDequantBNParam, - operators::FusionDequantBNKernel> { - public: - FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDequantBNParam, - operators::FusionDequantBNKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; -#endif // FUSION_DEQUANT_BN_OP - -#ifdef FUSION_DEQUANT_BN_RELU_OP -class FusionDequantBNReluMatcher : public FusionDequantBNMatcher { - public: - FusionDequantBNReluMatcher() : FusionDequantBNMatcher() { - node_ > std::make_shared(G_OP_TYPE_RELU); - } - - virtual std::string Type() { return 
G_OP_TYPE_FUSION_DEQUANT_BN_RELU; } -}; - -template -class FusionDequantBNReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionDequantBNParam, - operators::FusionDequantBNReluKernel> { - public: - FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDequantBNParam, - operators::FusionDequantBNReluKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; -}; -#endif // FUSION_DEQUANT_BN_RELU_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/fusion_dequant_bn_relu_op.h b/mobile/src/operators/fusion_dequant_bn_relu_op.h deleted file mode 100644 index be3b5293a3..0000000000 --- a/mobile/src/operators/fusion_dequant_bn_relu_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DEQUANT_BN_RELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/dequant_bn_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionDequantBNReluMatcher : public framework::FusionOpMatcher { - public: - FusionDequantBNReluMatcher() { - node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); - node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_BATCHNORM, - {{"Scale", "BNScale"}, - {"Mean", "BNMean"}, - {"Bias", "BNBias"}, - {"Variance", "BNVariance"}, - {"Y", "Out"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; } -}; - -template -class FusionDequantBNReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionDequantBNReluParam, - operators::FusionDequantBNReluKernel> { - public: - FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDequantBNReluParam, - operators::FusionDequantBNReluKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_dwconv_bn_relu_op.cpp b/mobile/src/operators/fusion_dwconv_bn_relu_op.cpp deleted file mode 100644 index d4c04f67fc..0000000000 --- a/mobile/src/operators/fusion_dwconv_bn_relu_op.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DWCONVBNRELU_OP - -#include "operators/fusion_dwconv_bn_relu_op.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionDWConvBNReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); - } - - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp); 
-#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/fusion_dwconv_bn_relu_op.h b/mobile/src/operators/fusion_dwconv_bn_relu_op.h deleted file mode 100644 index 0fb2e5c70c..0000000000 --- a/mobile/src/operators/fusion_dwconv_bn_relu_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DWCONVBNRELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/dwconv_bn_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher { - public: - FusionDWConvBNReluMatcher() { - node_ = framework::Node(G_OP_TYPE_DEPTHWISE_CONV); - node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_BATCHNORM, - {{"Scale", "Scale"}, - {"Mean", "Mean"}, - {"Bias", "Bias"}, - {"Variance", "Variance"}}}}, - removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_DWCONV_BN_RELU; } -}; - -template -class FusionDWConvBNReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionDWConvBNReluParam, - operators::DWConvBNReluKernel> { - 
public: - FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionDWConvBNReluParam, - operators::DWConvBNReluKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_elementwise_add_relu_op.cpp b/mobile/src/operators/fusion_elementwise_add_relu_op.cpp deleted file mode 100644 index def932a589..0000000000 --- a/mobile/src/operators/fusion_elementwise_add_relu_op.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_ELEMENTWISEADDRELU_OP - -#include "operators/fusion_elementwise_add_relu_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionElementwiseAddReluOp::InferShape() const { - auto x_dim = this->param_.InputX()->dims(); - this->param_.Out()->Resize(x_dim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu, - ops::FusioneElementwiseAddReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu, -// ops::FusionElementwiseAddReluOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu, - ops::FusionElementwiseAddReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_elementwise_add_relu_op.h b/mobile/src/operators/fusion_elementwise_add_relu_op.h deleted file mode 100644 index c90d4e041e..0000000000 --- a/mobile/src/operators/fusion_elementwise_add_relu_op.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_ELEMENTWISEADDRELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/elementwise_add_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusioneElementwiseAddReluMatcher : public framework::FusionOpMatcher { - public: - FusioneElementwiseAddReluMatcher() { - node_ = framework::Node(G_OP_TYPE_ELEMENTWISE_ADD); - node_ > std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), {}, removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU; } -}; - -template -class FusionElementwiseAddReluOp - : public framework::OperatorWithKernel< - DeviceType, ElementwiseAddReluParam, - operators::ElementwiseAddReluKernel> { - public: - FusionElementwiseAddReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, ElementwiseAddReluParam, - operators::ElementwiseAddReluKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/fusion_fc_op.cpp b/mobile/src/operators/fusion_fc_op.cpp deleted file mode 100644 index 0e6bb28ea2..0000000000 --- a/mobile/src/operators/fusion_fc_op.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_FC_OP - -#include "operators/fusion_fc_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionFcOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - auto y_dims = this->param_.InputY()->dims(); - int x_num_col_dims = this->param_.XNumColDims(); - int y_num_col_dims = this->param_.YNumColDims(); - - assert(x_dims.size() > x_num_col_dims); - assert(y_dims.size() > y_num_col_dims); - - /// (1,2,3,4) , x_num_col_dims = 2 -> (2,12) - auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); - auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); - - assert(x_mat_dims[1] == y_mat_dims[0]); - - std::vector output_dims; - output_dims.reserve( - static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); - - for (int i = 0; i < x_num_col_dims; ++i) { - output_dims.push_back(x_dims[i]); - } - - for (int i = y_num_col_dims; i < y_dims.size(); ++i) { - output_dims.push_back(y_dims[i]); - } - - framework::DDim ddim = framework::make_ddim(output_dims); - this->param_.Out()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_fc, ops::FusionFcMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp); -#endif - -#endif // FUSION_FC_OP diff --git 
a/mobile/src/operators/fusion_fc_op.h b/mobile/src/operators/fusion_fc_op.h deleted file mode 100644 index a88add4584..0000000000 --- a/mobile/src/operators/fusion_fc_op.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_FC_OP - -#pragma once - -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/fusion_fc_kernel.h" - -namespace paddle_mobile { -namespace operators { - -class FusionFcMatcher : public framework::FusionOpMatcher { - public: - FusionFcMatcher() { - node_ = framework::Node(G_OP_TYPE_MUL); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Z"}}}}, removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FC; } -}; - -template -class FusionFcOp : public framework::OperatorWithKernel< - DeviceType, FusionFcParam, - operators::FusionFcKernel> { - public: - FusionFcOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::FusionFcKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; -}; - -} // namespace operators -} // 
namespace paddle_mobile - -#endif // FUSION_FC_OP diff --git a/mobile/src/operators/fusion_fc_relu_op.cpp b/mobile/src/operators/fusion_fc_relu_op.cpp deleted file mode 100644 index f47b220e36..0000000000 --- a/mobile/src/operators/fusion_fc_relu_op.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_FCRELU_OP - -#include "operators/fusion_fc_relu_op.h" -namespace paddle_mobile { -namespace operators { - -template -void FusionFcReluOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - auto y_dims = this->param_.InputY()->dims(); - int x_num_col_dims = this->param_.XNumColDims(); - int y_num_col_dims = this->param_.YNumColDims(); - - assert(x_dims.size() > x_num_col_dims); - assert(y_dims.size() > y_num_col_dims); - - /// (1,2,3,4) , x_num_col_dims = 2 -> (2,12) - auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); - auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); - - assert(x_mat_dims[1] == y_mat_dims[0]); - - std::vector output_dims; - output_dims.reserve( - static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); - - for (int i = 0; i < x_num_col_dims; ++i) { - output_dims.push_back(x_dims[i]); - } - - for (int i = y_num_col_dims; i < y_dims.size(); ++i) { - output_dims.push_back(y_dims[i]); - } - - framework::DDim ddim = framework::make_ddim(output_dims); - this->param_.Out()->Resize(ddim); -} - -} // 
namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(fusion_fc_relu, ops::FusionFcReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_fc_relu_op.h b/mobile/src/operators/fusion_fc_relu_op.h deleted file mode 100644 index 253335c8f2..0000000000 --- a/mobile/src/operators/fusion_fc_relu_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_FCRELU_OP -#pragma once -#include -#include - -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/fc_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -using std::vector; -class FusionFcReluMatcher : public framework::FusionOpMatcher { - public: - FusionFcReluMatcher() { - node_ = framework::Node(G_OP_TYPE_MUL); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Z"}}}}, removed_nodes); - } - - std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; } -}; - -template -class FusionFcReluOp : public framework::OperatorWithKernel< - DeviceType, FusionFcReluParam, - operators::FusionFcReluKernel> { - public: - FusionFcReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionFcReluParam, - operators::FusionFcReluKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_FC_RELU_OP diff --git a/mobile/src/operators/fusion_instancenorm_relu_op.cpp b/mobile/src/operators/fusion_instancenorm_relu_op.cpp deleted file mode 100644 index f6299fa72d..0000000000 --- a/mobile/src/operators/fusion_instancenorm_relu_op.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_INSTANCENORM_RELU_OP - -#include "operators/fusion_instancenorm_relu_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionInstanceNormReluOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - this->param_.Out()->Resize(x_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_instancenorm_relu, - ops::FusionInstanceNormReluMatcher); - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(fusion_instancenorm_relu, ops::FusionInstanceNormReluOp); -#endif - -#endif diff --git a/mobile/src/operators/fusion_instancenorm_relu_op.h b/mobile/src/operators/fusion_instancenorm_relu_op.h deleted file mode 100644 index 91551e6558..0000000000 --- a/mobile/src/operators/fusion_instancenorm_relu_op.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_INSTANCENORM_RELU_OP - -#pragma once - -#include -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/instancenorm_relu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher { - public: - FusionInstanceNormReluMatcher() { - node_ = framework::Node(G_OP_TYPE_INSTANCENORM); - node_ > std::make_shared(G_OP_TYPE_RELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), {}, removed_nodes); - } - std::string Type() { return G_OP_TYPE_FUSION_INSTANCENORM_RELU; } -}; - -template -class FusionInstanceNormReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionInstanceNormReluParam, - operators::InstanceNormReluKernel> { - public: - FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionInstanceNormReluParam, - operators::InstanceNormReluKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/grid_sampler_op.cpp b/mobile/src/operators/grid_sampler_op.cpp deleted file mode 100644 index 90809f1d4c..0000000000 --- a/mobile/src/operators/grid_sampler_op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef GRID_SAMPLER_OP - -#include "operators/grid_sampler_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void GridSamplerOp::InferShape() const { - auto x_dim = this->param_.InputX()->dims(); - this->param_.Output()->Resize(x_dim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(grid_sampler, ops::GridSamplerOp); -#endif - -#endif diff --git a/mobile/src/operators/grid_sampler_op.h b/mobile/src/operators/grid_sampler_op.h deleted file mode 100644 index 9d142b9d47..0000000000 --- a/mobile/src/operators/grid_sampler_op.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef GRID_SAMPLER_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/grid_sampler_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef GRID_SAMPLER_OP -DECLARE_OPERATOR(GridSampler, GridSamplerParam, GridSamplerKernel); -#endif - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/gru_op.cpp b/mobile/src/operators/gru_op.cpp deleted file mode 100644 index db0936d00c..0000000000 --- a/mobile/src/operators/gru_op.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef GRU_OP - -#include "operators/gru_op.h" -#include -#include "common/enforce.h" - -namespace paddle_mobile { -namespace operators { - -template -void GruOp::InferShape() const { - auto input_dims = this->param_.InputInput()->dims(); - auto weight_dims = this->param_.InputWeight()->dims(); - int input_size = input_dims[1]; - int frame_size = weight_dims[0]; - PADDLE_MOBILE_ENFORCE( - (input_size == frame_size * 3), - "The input_size must be 3 times of frame_size in GRUOp."); - PADDLE_MOBILE_ENFORCE( - (weight_dims[1] == frame_size * 3), - "The shape of Weight matrix must be [frame_size, frame_size * 3]."); - if (this->param_.InputH0()) { - auto h0_dims = this->param_.InputH0()->dims(); - PADDLE_MOBILE_ENFORCE((h0_dims[1] == frame_size), - "The width of H0 must be equal to frame_size."); - } - if (this->param_.InputBias()) { - auto bias_dims = this->param_.InputBias()->dims(); - int bias_height = bias_dims[0]; - int bias_width = bias_dims[1]; - PADDLE_MOBILE_ENFORCE((bias_height == 1), - "The shape of Bias must be [1, frame_size * 3]."); - PADDLE_MOBILE_ENFORCE((bias_width == frame_size * 3), - "The shape of Bias must be [1, frame_size * 3]."); - } - this->param_.OutBatchGate()->Resize(input_dims); - this->param_.OutBatchResetHiddenPrev()->Resize({input_dims[0], frame_size}); - this->param_.OutBatchHidden()->Resize({input_dims[0], frame_size}); - this->param_.OutHidden()->Resize({input_dims[0], frame_size}); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(gru, ops::GruOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/gru_op.h b/mobile/src/operators/gru_op.h deleted file mode 100644 index 80bbd7c222..0000000000 --- a/mobile/src/operators/gru_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef GRU_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/gru_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class GruOp : public framework::OperatorWithKernel< - DeviceType, GruParam, - operators::GruKernel> { - public: - GruOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::GruKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/gru_unit_op.cpp b/mobile/src/operators/gru_unit_op.cpp deleted file mode 100644 index 5dd1cd3dd3..0000000000 --- a/mobile/src/operators/gru_unit_op.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef GRU_UNIT_OP - -#include "operators/gru_unit_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void GruUnitOp::InferShape() const { - auto input_dims = this->param_.InputInput()->dims(); - auto hidden_prev_dims = this->param_.InputHiddenPrev()->dims(); - auto weight_dims = this->param_.InputWeight()->dims(); - int batch_size = input_dims[0]; - int input_size = input_dims[1]; - int frame_size = hidden_prev_dims[1]; - int weight_height = weight_dims[0]; - int weight_width = weight_dims[1]; - PADDLE_MOBILE_ENFORCE( - (input_size == frame_size * 3), - "The input_size must be 3 times of frame_size in GRUUnitOp."); - PADDLE_MOBILE_ENFORCE( - (weight_height == frame_size), - "The shape of Weight matrix must be [frame_size, frame_size * 3]."); - PADDLE_MOBILE_ENFORCE( - (weight_width == frame_size * 3), - "The shape of Weight matrix must be [frame_size, frame_size * 3]."); - if (this->param_.InputBias()) { - auto bias_dims = this->param_.InputBias()->dims(); - int bias_height = bias_dims[0]; - int bias_width = bias_dims[1]; - PADDLE_MOBILE_ENFORCE((bias_height == 1), - "The shape of Bias must be [1, frame_size * 3]."); - PADDLE_MOBILE_ENFORCE((bias_width == frame_size * 3), - "The shape of Bias must be [1, frame_size * 3]."); - } - this->param_.OutGate()->Resize({batch_size, frame_size * 3}); - this->param_.OutResetHiddenPrev()->Resize({batch_size, frame_size}); - this->param_.OutHidden()->Resize({batch_size, frame_size}); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(gru_unit, ops::GruUnitOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - -#ifdef PADDLE_MOBILE_CL -#endif - -#endif diff --git a/mobile/src/operators/gru_unit_op.h b/mobile/src/operators/gru_unit_op.h deleted file mode 100644 index 8821212bfa..0000000000 --- 
a/mobile/src/operators/gru_unit_op.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef GRU_UNIT_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/gru_unit_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class GruUnitOp : public framework::OperatorWithKernel< - DeviceType, GruUnitParam, - operators::GruUnitKernel> { - public: - GruUnitOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::GruUnitKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/im2sequence_op.cpp b/mobile/src/operators/im2sequence_op.cpp deleted file mode 100644 index 75a3c8c350..0000000000 --- a/mobile/src/operators/im2sequence_op.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef IM2SEQUENCE_OP - -#include "operators/im2sequence_op.h" -#include - -namespace paddle_mobile { -namespace operators { - -int Im2SequenceOutputSize(int input_size, int kernel, int padding_1, - int padding_2, int stride) { - int output_size = - 1 + (padding_1 + padding_2 + input_size - kernel + stride - 1) / stride; - return output_size; -} - -template -void Im2SequenceOp::InferShape() const { - auto in_x_dims = this->param_.Input()->dims(); - const std::vector &kernels = this->param_.Kernels(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(Im2SequenceOutputSize(in_x_dims[i + 2], kernels[i], - paddings[i], paddings[i + 2], - strides[i])); - } - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp); -#endif - -#endif // IM2SEQUENCE_OP diff --git a/mobile/src/operators/im2sequence_op.h b/mobile/src/operators/im2sequence_op.h deleted file mode 100644 index 4361380b8f..0000000000 --- a/mobile/src/operators/im2sequence_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef IM2SEQUENCE_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/im2sequence_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class Im2SequenceOp : public framework::OperatorWithKernel< - DeviceType, Im2SequenceParam, - operators::Im2SequenceKernel> { - public: - Im2SequenceOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, Im2SequenceParam, - operators::Im2SequenceKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const override; - - private: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/increment_op.cpp b/mobile/src/operators/increment_op.cpp deleted file mode 100644 index 7a04ae9b77..0000000000 --- a/mobile/src/operators/increment_op.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef INCREMENT_OP - -#include "operators/increment_op.h" -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" - -namespace paddle_mobile { -namespace operators { - -template -void IncrementOp::InferShape() const { - auto input = this->param_.InputX(); - auto out = this->param_.Out(); - PADDLE_MOBILE_ENFORCE(input->numel() == 1, "input's numel should be 1"); - out->Resize(input->dims()); - if (std::is_same, Dtype>::value) { - out->set_lod(input->lod()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(increment, ops::IncrementOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - -#ifdef PADDLE_MOBILE_CL -#endif - -#endif diff --git a/mobile/src/operators/increment_op.h b/mobile/src/operators/increment_op.h deleted file mode 100644 index e0455b9113..0000000000 --- a/mobile/src/operators/increment_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef INCREMENT_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/increment_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class IncrementOp - : public framework::OperatorWithKernel, - IncrementKernel> { - public: - IncrementOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - IncrementKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/instancenorm_op.cpp b/mobile/src/operators/instancenorm_op.cpp deleted file mode 100644 index 42af75ca21..0000000000 --- a/mobile/src/operators/instancenorm_op.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef INSTANCENORM_OP - -#include "operators/instancenorm_op.h" -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" - -namespace paddle_mobile { -namespace operators { - -template -void InstanceNormOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - this->param_.OutputY()->Resize(x_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(instance_norm, ops::InstanceNormOp); -#endif - -#endif diff --git a/mobile/src/operators/instancenorm_op.h b/mobile/src/operators/instancenorm_op.h deleted file mode 100644 index 0047ce47ad..0000000000 --- a/mobile/src/operators/instancenorm_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef INSTANCENORM_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/instancenorm_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class InstanceNormOp - : public framework::OperatorWithKernel, - InstanceNormKernel> { - public: - InstanceNormOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - InstanceNormKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/is_empty_op.cpp b/mobile/src/operators/is_empty_op.cpp deleted file mode 100644 index e3d71c8427..0000000000 --- a/mobile/src/operators/is_empty_op.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef IS_EMPTY_OP - -#include "operators/is_empty_op.h" -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" - -namespace paddle_mobile { -namespace operators { - -template -void IsEmptyOp::InferShape() const { - auto out = this->param_.Out(); - out->Resize({1}); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(is_empty, ops::IsEmptyOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - -#ifdef PADDLE_MOBILE_CL -#endif - -#endif diff --git a/mobile/src/operators/is_empty_op.h b/mobile/src/operators/is_empty_op.h deleted file mode 100644 index 1f31f25796..0000000000 --- a/mobile/src/operators/is_empty_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef IS_EMPTY_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/is_empty_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class IsEmptyOp - : public framework::OperatorWithKernel, - IsEmptyKernel> { - public: - IsEmptyOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - IsEmptyKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/activation_kernel.h b/mobile/src/operators/kernel/activation_kernel.h deleted file mode 100644 index b27691d521..0000000000 --- a/mobile/src/operators/kernel/activation_kernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef RELU_OP -DECLARE_KERNEL(Relu, ReluParam); -DECLARE_KERNEL(Relu6, Relu6Param); -#endif - -#ifdef SIGMOID_OP -DECLARE_KERNEL(Sigmoid, SigmoidParam); -#endif - -#ifdef TANH_OP -DECLARE_KERNEL(Tanh, TanhParam); -#endif - -#ifdef LOG_OP -DECLARE_KERNEL(Log, ReluParam); -#endif - -#ifdef LEAKY_RELU_OP -DECLARE_KERNEL(LeakyRelu, LeakyReluParam); -#endif -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/activation_kernel.cpp b/mobile/src/operators/kernel/arm/activation_kernel.cpp deleted file mode 100644 index be8ebc532f..0000000000 --- a/mobile/src/operators/kernel/arm/activation_kernel.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/kernel/activation_kernel.h" -#include "common/types.h" -#include "operators/kernel/central-arm-func/activation_arm_func.h" -#include "operators/math/activation.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { - -#ifdef RELU_OP -template <> -bool ReluKernel::Init(ReluParam *param) { - return true; -} - -template <> -void ReluKernel::Compute(const ReluParam ¶m) { - const LoDTensor *input = param.InputX(); - LoDTensor *output = param.Out(); - ActivationCompute()(input, output); - output->set_lod(input->lod()); -} - -template <> -bool Relu6Kernel::Init(Relu6Param *param) { - return true; -} - -template <> -void Relu6Kernel::Compute(const Relu6Param ¶m) { - const LoDTensor *input = param.InputX(); - LoDTensor *output = param.Out(); - float threshold = param.getThreshold(); - ActivationCompute()(input, output, threshold); - output->set_lod(input->lod()); -} -#endif - -#ifdef SIGMOID_OP -template <> -bool SigmoidKernel::Init(SigmoidParam *param) { - return true; -} - -template <> -void SigmoidKernel::Compute(const SigmoidParam ¶m) { - const LoDTensor *input = param.InputX(); - LoDTensor *output = param.Out(); - ActivationCompute()(input, output); - output->set_lod(input->lod()); -} -#endif - -#ifdef TANH_OP -template <> -bool TanhKernel::Init(TanhParam *param) { - return true; -} - -template <> -void TanhKernel::Compute(const TanhParam ¶m) { - const LoDTensor *input = param.InputX(); - LoDTensor *output = param.Out(); - ActivationCompute()(input, output); - output->set_lod(input->lod()); -} -#endif - -#ifdef LOG_OP -template <> -bool LogKernel::Init(ReluParam *param) { - return true; -} - -template <> -void LogKernel::Compute(const ReluParam ¶m) { - const LoDTensor *input = param.InputX(); - LoDTensor *output = param.Out(); - ActivationCompute()(input, output); - output->set_lod(input->lod()); -} -#endif - -#ifdef LEAKY_RELU_OP -template <> -bool 
LeakyReluKernel::Init(LeakyReluParam *param) { - return true; -} - -template <> -void LeakyReluKernel::Compute(const LeakyReluParam ¶m) { - const LoDTensor *input = param.InputX(); - LoDTensor *output = param.Out(); - ActivationCompute()(input, output, param.Alpha()); - output->set_lod(input->lod()); -} -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/arm/anchor_generator_kernel.cpp deleted file mode 100644 index c493d78bb0..0000000000 --- a/mobile/src/operators/kernel/arm/anchor_generator_kernel.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ANCHOR_GENERATOR_OP - -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool AnchorGeneratorKernel::Init(AnchorGeneratorParam *param) { - return true; -} - -template <> -void AnchorGeneratorKernel::Compute( - const AnchorGeneratorParam ¶m) { - // TODO(hjchen2) -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ANCHOR_GENERATOR_OP diff --git a/mobile/src/operators/kernel/arm/assign_kernel.cpp b/mobile/src/operators/kernel/arm/assign_kernel.cpp deleted file mode 100644 index 823bb3ca41..0000000000 --- a/mobile/src/operators/kernel/arm/assign_kernel.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ASSIGN_OP - -#include "operators/kernel/assign_kernel.h" -#include "framework/data_type.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool AssignKernel::Init(AssignParam* param) { - return true; -} - -template <> -void AssignKernel::Compute(const AssignParam& param) { - const auto* input = param.Input(); - auto* out = param.Output(); - out->mutable_data(); - framework::TensorCopy(*input, out); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ASSIGN_OP diff --git a/mobile/src/operators/kernel/arm/assign_value_kernel.cpp b/mobile/src/operators/kernel/arm/assign_value_kernel.cpp deleted file mode 100644 index 2e98b9f777..0000000000 --- a/mobile/src/operators/kernel/arm/assign_value_kernel.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ASSIGN_VALUE_OP - -#include "operators/kernel/assign_value_kernel.h" -#include "framework/data_type.h" - -namespace paddle_mobile { -namespace operators { - -struct AssignValueOpFunctor { - framework::LoDTensor* output_; - const std::vector shape_; - const std::vector int32_values_; - const std::vector fp32_values_; - - AssignValueOpFunctor(framework::LoDTensor* output, - const std::vector& shape, - const std::vector& fp32_values, - const std::vector& int32_values) - : output_(output), - shape_(shape), - int32_values_(int32_values), - fp32_values_(fp32_values) {} - - template - inline void apply() const { - PADDLE_MOBILE_THROW_EXCEPTION("Assign value: not supported data type."); - } -}; - -template <> -inline void AssignValueOpFunctor::apply() const { - framework::TensorFromVector(int32_values_, output_); - output_->Resize(framework::make_ddim(shape_)); -} - -template <> -inline void AssignValueOpFunctor::apply() const { - framework::TensorFromVector(fp32_values_, output_); - output_->Resize(framework::make_ddim(shape_)); -} - -template <> -bool AssignValueKernel::Init(AssignValueParam* param) { - return true; -} - -template <> -void AssignValueKernel::Compute( - const AssignValueParam& param) { - framework::VisitDataType( - framework::ToDataType(param.dtype_), - AssignValueOpFunctor(param.output_, param.shape_, param.fp32_values_, - param.int32_values_)); -} - -template <> -bool AssignValueKernel::Init(AssignValueParam* param) { - return true; -} - -template <> -void AssignValueKernel::Compute( - const AssignValueParam& param) { - framework::VisitDataType( - framework::ToDataType(param.dtype_), - AssignValueOpFunctor(param.output_, param.shape_, param.fp32_values_, - param.int32_values_)); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ASSIGN_VALUE_OP diff --git a/mobile/src/operators/kernel/arm/batchnorm_kernel.cpp b/mobile/src/operators/kernel/arm/batchnorm_kernel.cpp deleted file mode 100644 index f31c4426db..0000000000 
--- a/mobile/src/operators/kernel/arm/batchnorm_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BATCHNORM_OP - -#include "operators/kernel/batchnorm_kernel.h" -#include "operators/kernel/central-arm-func/batchnorm_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool BatchNormKernel::Init(BatchNormParam *param) { - return true; -} - -template <> -void BatchNormKernel::Compute(const BatchNormParam ¶m) { - BatchnormCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/beam_search_decode_kernel.cpp b/mobile/src/operators/kernel/arm/beam_search_decode_kernel.cpp deleted file mode 100644 index 97aaffe7c2..0000000000 --- a/mobile/src/operators/kernel/arm/beam_search_decode_kernel.cpp +++ /dev/null @@ -1,278 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BEAM_SEARCH_DECODE_OP - -#include "operators/kernel/beam_search_decode_kernel.h" -#include -#include "framework/data_type.h" - -namespace paddle_mobile { -namespace operators { - -using LoDTensor = framework::LoDTensor; -using LoDTensorArray = framework::LoDTensorArray; - -// all the lod have 2 levels. -// The first is source level, the second is sentence level. -// source level describe how many prefixes (branchs) for each source sentece -// (beam). sentence level describe how these candidates belong to the prefixes. -const size_t kSourceLevel = 0; -const size_t kSentenceLevel = 1; - -template -struct Sentence { - std::vector word_ids; - std::vector scores; -}; - -template -using SentenceVector = std::vector>; - -template -struct BeamSearchDecoder { - BeamSearchDecoder(size_t beam_size, int end_id) - : beam_size_(beam_size), end_id_(end_id) {} - - /** - * convert the result sentence_vector for each source sentence into two - * LodTensor. - * One is all candidate sentences with word id, one is all candidate sentences - * with word score. - * Param: - * sentence_vector_list: sentence_vector for each source sentence. - * id_tensor: result LoDTensor for sentences of id. - * score_tensor: result LoDTensor for sentences of score. - * reverse: whether ids of sentence in sentence_vector_list is reversed - * sort_by_score: whether to sort hypotheses of each sentence by scores. - */ - void ConvertSentenceVectorToLodTensor( - std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor, bool reverse = true, - bool sort_by_score = true) const; - - /** - * Gather the hypotheses for each source sentence by backtrace though the - * LoDTensorArray step_ids whose lods reserve the path in the tree. 
- */ - void Backtrace(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; - - size_t beam_size_; - int end_id_; -}; - -template -void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( - std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor, bool reverse, bool sort_by_score) const { - size_t src_num = sentence_vector_list.size(); - - PADDLE_MOBILE_ENFORCE(src_num > 0, "src_num should be larger than 0"); - - std::vector source_level_lod = {0}; - std::vector sentence_level_lod = {0}; - std::vector id_data; - std::vector score_data; - - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - if (sort_by_score) { - sort(sentence_vector_list[src_idx].begin(), - sentence_vector_list[src_idx].end(), - [reverse](const Sentence& a, const Sentence& b) { - if (reverse) - return a.scores.front() > b.scores.front(); - else - return a.scores.back() > b.scores.back(); - }); - } - for (Sentence& sentence : sentence_vector_list[src_idx]) { - if (reverse) { - id_data.insert(id_data.end(), sentence.word_ids.rbegin(), - sentence.word_ids.rend()); - score_data.insert(score_data.end(), sentence.scores.rbegin(), - sentence.scores.rend()); - } else { - id_data.insert(id_data.end(), sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert(score_data.end(), sentence.scores.begin(), - sentence.scores.end()); - } - - sentence_level_lod.push_back(sentence_level_lod.back() + - sentence.word_ids.size()); - } - source_level_lod.push_back(source_level_lod.back() + - sentence_vector_list[src_idx].size()); - } - - framework::LoD lod; - lod.push_back(source_level_lod); - lod.push_back(sentence_level_lod); - - id_tensor->set_lod(lod); - id_tensor->Resize({static_cast(id_data.size())}); - id_tensor->mutable_data(); - framework::TensorFromVector(id_data, id_tensor); - - score_tensor->set_lod(lod); - score_tensor->Resize({static_cast(score_data.size())}); - 
score_tensor->mutable_data(); - framework::TensorFromVector(score_data, score_tensor); -} - -template -void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { - PADDLE_MOBILE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); - PADDLE_MOBILE_ENFORCE(step_ids.size() == step_scores.size(), - "step_ids and step_scores should be the same"); - const size_t step_num = step_ids.size(); - const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; - std::vector> sentence_vector_list( - src_num, SentenceVector(beam_size_)); - std::vector> prefix_idx_vector_list(src_num); - for (int step_id = step_num - 1; step_id >= 0; --step_id) { - auto& cur_ids = step_ids.at(step_id); - auto& cur_scores = step_scores.at(step_id); - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - // for each source sentence - auto& sentence_vector = sentence_vector_list.at(src_idx); - auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); - size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - if (prefix_idx_vector.empty()) { // be finished and pruned at this step - // or the last time step - for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; - ++prefix_idx) { - size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - size_t candidate_end = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; ++candidate_idx) { - prefix_idx_vector.push_back(prefix_idx); - size_t idx = prefix_idx_vector.size() - 1; - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } - } - } else { // use prefix_idx_vector to backtrace - 
size_t src_candidate_start = - cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; - size_t prefix_idx = src_prefix_start; - size_t candidate_num = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { - auto candidate_idx = prefix_idx_vector.at(idx); - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { - // to skip redundant end tokens - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } - - while (src_candidate_start + candidate_num <= - candidate_idx) { // search the corresponding prefix - prefix_idx++; - candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - } - prefix_idx_vector.at(idx) = prefix_idx; - } - } - } - } - - ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, - score_tensor, true, true); -} - -struct BeamSearchDecodeFunctor { - BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, LoDTensor* score_tensor, - size_t beam_size, int end_id) - : beam_size_(beam_size), - end_id_(end_id), - step_ids_(step_ids), - step_scores_(step_scores), - id_tensor_(id_tensor), - score_tensor_(score_tensor) {} - - template - void apply() const; - - size_t beam_size_; - int end_id_; - const LoDTensorArray& step_ids_; - const LoDTensorArray& step_scores_; - LoDTensor* id_tensor_; - LoDTensor* score_tensor_; -}; - -template -void BeamSearchDecodeFunctor::apply() const { - BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); - beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, - score_tensor_); -} - -template <> -void BeamSearchDecodeFunctor::apply() const { - PADDLE_MOBILE_THROW_EXCEPTION("beam search decode op does not support 
bool."); -} - -template <> -bool BeamSearchDecodeKernel::Init( - BeamSearchDecodeParam* param) { - return true; -} - -template <> -void BeamSearchDecodeKernel::Compute( - const BeamSearchDecodeParam& param) { - const LoDTensorArray* ids = param.ids_; - const LoDTensorArray* scores = param.scores_; - - const size_t step_num = ids->size(); - PADDLE_MOBILE_ENFORCE(step_num > 0, - "beam search steps should be larger than 0"); - - for (size_t i = 0; i < step_num; ++i) { - PADDLE_MOBILE_ENFORCE(ids->at(i).lod().size() == 2, - "Level of LodTensor should be 2"); - } - const size_t source_num = ids->at(0).lod().at(0).size() - 1; - PADDLE_MOBILE_ENFORCE(source_num > 0, "source num should be larger than 0"); - - LoDTensor* sentence_ids = param.sentence_ids_; - LoDTensor* sentence_scores = param.sentence_scores_; - - framework::VisitDataType( - framework::ToDataType(scores->at(0).type()), - BeamSearchDecodeFunctor(*ids, *scores, sentence_ids, sentence_scores, - param.beam_size_, param.end_id_)); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/beam_search_kernel.cpp b/mobile/src/operators/kernel/arm/beam_search_kernel.cpp deleted file mode 100644 index 9128c57c64..0000000000 --- a/mobile/src/operators/kernel/arm/beam_search_kernel.cpp +++ /dev/null @@ -1,262 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BEAM_SEARCH_OP - -#include "operators/kernel/beam_search_kernel.h" -#include -#include - -namespace paddle_mobile { -namespace operators { - -template -class BeamSearchFunctor { - public: - void operator()(const framework::LoDTensor *pre_ids, - const framework::LoDTensor *pre_scores, - const framework::LoDTensor *ids, - const framework::LoDTensor *scores, - framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores, - framework::Tensor *parent_idx, size_t level, size_t beam_size, - int end_id, bool is_accumulated) { - auto abs_lod = framework::ToAbsOffset(scores->lod()); - auto &high_level = abs_lod[level]; - - auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level, - beam_size, end_id, is_accumulated); - auto selected_items = ToMap(items, high_level.back()); - - PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), std::end(selected_items), 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - auto dims = framework::make_ddim( - std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - parent_idx->Resize({static_cast(num_instances)}); - - auto *selected_ids_data = selected_ids->mutable_data(); - auto *selected_scores_data = selected_scores->mutable_data(); - auto *parent_idx_data = parent_idx->mutable_data(); - - // fill in data - std::vector low_level; - size_t low_offset = 0; - for (auto &items : selected_items) { - low_level.push_back(low_offset); - for (auto &item : items) { - parent_idx_data[low_offset] = static_cast(low_level.size() - 1); - selected_ids_data[low_offset] = item.id; - selected_scores_data[low_offset] = item.score; - low_offset++; - } - } - low_level.push_back(low_offset); - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), 
high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); - } - - /* - * The basic items help to sort. - */ - struct Item { - Item() {} - Item(size_t offset, size_t id, float score) - : offset(offset), id(id), score(score) {} - // offset in the higher lod level. - size_t offset; - // prefix id in the lower lod level. - // size_t prefix; - // the candidate id - size_t id; - // the corresponding score - float score; - - inline bool operator<(const Item &in) const { - return (score < in.score) || - ((score == in.score) && (offset < in.offset)); - } - - inline void operator=(const Item &in) { - offset = in.offset; - id = in.id; - score = in.score; - } - }; - - protected: - /* - * Prune the source sentences all branchs finished, and it is optional. - * Pruning must one step later than finishing (thus pre_ids is needed here), - * since the end tokens must be writed out. - */ - void PruneEndBeams(const framework::LoDTensor *pre_ids, - const framework::LoD &abs_lod, - std::vector> *items, size_t lod_level, - int end_id) { - auto *pre_ids_data = pre_ids->data(); - auto &high_level = abs_lod[lod_level]; - for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { - size_t src_prefix_start = high_level[src_idx]; - size_t src_prefix_end = high_level[src_idx + 1]; - bool finish_flag = true; - for (size_t offset = src_prefix_start; offset < src_prefix_end; - offset++) { - for (auto &item : items->at(offset)) { - if (item.id != static_cast(end_id) || - pre_ids_data[offset] != end_id) { - finish_flag = false; - break; - } - } - if (!finish_flag) break; - } - if (finish_flag) { // all branchs of the beam (source sentence) end and - // prune this beam - for (size_t offset = src_prefix_start; offset < src_prefix_end; - offset++) - items->at(offset).clear(); - } - } - } - - /* - * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance. 
- */ - std::vector> ToMap( - const std::vector> &items, size_t element_num) { - std::vector> result; - result.resize(element_num); - for (auto &entries : items) { - for (const auto &item : entries) { - result[item.offset].push_back(item); - } - } - return result; - } - - void Insert(std::vector *top_beam_ptr, const Item &item, - size_t beam_size) { - std::vector &top_beam = *top_beam_ptr; - - size_t num_beams = top_beam.size(); - if (num_beams < beam_size) { - top_beam.resize(num_beams + 1); - num_beams++; - } else { - if (item < top_beam[beam_size - 1]) { - return; - } - } - - for (int k = static_cast(num_beams) - 2; k >= 0; --k) { - if (top_beam[k] < item) { - top_beam[k + 1] = top_beam[k]; - } else { - top_beam[k + 1] = item; - return; - } - } - top_beam[0] = item; - } - - /* - * For each source, select top beam_size records. - */ - std::vector> SelectTopBeamSizeItems( - const framework::LoDTensor *pre_ids, - const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids, - const framework::LoDTensor *scores, size_t lod_level, size_t beam_size, - int end_id, bool is_accumulated) { - std::vector> result; - - // find the current candidates - auto abs_lod = framework::ToAbsOffset(scores->lod()); - - auto *pre_ids_data = pre_ids->data(); - auto *pre_scores_data = pre_scores->data(); - - auto *ids_data = ids ? 
ids->data() : nullptr; - auto *scores_data = scores->data(); - - size_t num_seqs = scores->NumElements(lod_level); - size_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { - seq_width *= scores->dims()[i]; - } - - for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) { - size_t seq_offset_start = abs_lod[lod_level][seq_id]; - size_t seq_offset_end = abs_lod[lod_level][seq_id + 1]; - - std::vector top_beam; - top_beam.reserve(beam_size); - - for (size_t offset = seq_offset_start; offset < seq_offset_end; - ++offset) { - auto pre_id = pre_ids_data[offset]; - auto pre_score = pre_scores_data[offset]; - if (pre_id == end_id) { - // Allocate all probability mass to end_id for finished branchs and - // the other candidate ids can be ignored. - Item item(offset, end_id, pre_score); - Insert(&top_beam, item, beam_size); - } else { - size_t index = offset * seq_width; - for (size_t d = 0; d < seq_width; d++, index++) { - int64_t id = ids_data ? ids_data[index] : static_cast(d); - float score = is_accumulated - ? 
scores_data[index] - : pre_score + std::log(scores_data[index]); - Item item(offset, id, score); - Insert(&top_beam, item, beam_size); - } - } - } - - result.emplace_back(top_beam); - } - - return result; - } -}; - -template <> -bool BeamSearchKernel::Init(BeamSearchParam *param) { - return true; -} - -template <> -void BeamSearchKernel::Compute(const BeamSearchParam ¶m) { - BeamSearchFunctor alg; - alg(param.pre_ids_, param.pre_scores_, param.ids_, param.scores_, - param.selected_ids_, param.selected_scores_, param.parent_idx_, - param.level_, param.beam_size_, param.end_id_, param.is_accumulated_); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/bilinear_interp_kernel.cpp b/mobile/src/operators/kernel/arm/bilinear_interp_kernel.cpp deleted file mode 100644 index 85192e28ed..0000000000 --- a/mobile/src/operators/kernel/arm/bilinear_interp_kernel.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BILINEAR_INTERP_OP - -#include "operators/kernel/bilinear_interp_kernel.h" -#include "operators/kernel/central-arm-func/bilinear_interp_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool BilinearInterpKernel::Init(BilinearInterpParam *param) { - return true; -} - -template <> -void BilinearInterpKernel::Compute( - const BilinearInterpParam ¶m) { - BilinearInterpCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/box_coder_kernel.cpp b/mobile/src/operators/kernel/arm/box_coder_kernel.cpp deleted file mode 100644 index 30ede12dff..0000000000 --- a/mobile/src/operators/kernel/arm/box_coder_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BOXCODER_OP - -#include "operators/kernel/box_coder_kernel.h" -#include "operators/kernel/central-arm-func/box_coder_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool BoxCoderKernel::Init(BoxCoderParam *param) { - return true; -} - -template <> -void BoxCoderKernel::Compute(const BoxCoderParam ¶m) { - BoxCoderCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/cast_kernel.cpp b/mobile/src/operators/kernel/arm/cast_kernel.cpp deleted file mode 100644 index 166e821172..0000000000 --- a/mobile/src/operators/kernel/arm/cast_kernel.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CAST_OP - -#include -#include -#include "framework/data_type.h" -#include "operators/kernel/kernels.h" - -namespace paddle_mobile { -namespace operators { - -template -struct CastOutOpFunctor { - const framework::Tensor* in_; - framework::Tensor* out_; - CastOutOpFunctor(const framework::Tensor* in, framework::Tensor* out) - : in_(in), out_(out) {} - - template - void apply() const { - const InT* input = in_->data(); - OutT* output = out_->mutable_data(); - size_t numel = in_->numel(); - for (int i = 0; i < numel; ++i) { - output[i] = static_cast(input[i]); - } - } -}; - -// struct CastOpFunctor { -// const framework::Tensor* in_; -// framework::Tensor* out_; -// int output_type_; -// CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, -// const int output_type) -// : in_(in), out_(out), output_type_(output_type) {} -// -// template -// void apply() const { -// framework::VisitDataType(framework::ToDataType(output_type_), -// CastOutOpFunctor(in_, out_)); -// } -//}; - -template <> -bool CastKernel::Init(CastParam* param) { - return true; -} - -template <> -void CastKernel::Compute(const CastParam& param) { - const Tensor* input = param.input_; - Tensor* output = param.output_; - if (input->type() == type_id()) { - framework::VisitDataType(framework::ToDataType(param.output_type_), - CastOutOpFunctor(input, output)); - } else if (input->type() == type_id()) { - framework::VisitDataType(framework::ToDataType(param.output_type_), - CastOutOpFunctor(input, output)); - } else if (input->type() == type_id()) { - framework::VisitDataType(framework::ToDataType(param.output_type_), - CastOutOpFunctor(input, output)); - } else { - PADDLE_MOBILE_ENFORCE(0, "input tpye not support now!") - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // CAST_OP diff --git a/mobile/src/operators/kernel/arm/compare_kernel.cpp b/mobile/src/operators/kernel/arm/compare_kernel.cpp deleted file mode 100644 index d321740fd2..0000000000 --- 
a/mobile/src/operators/kernel/arm/compare_kernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/kernel/compare_kernel.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { - -typedef enum { - LESS_THAN = 0, - LESS_EQUAL = 1, - GREATER_THAN = 2, - GREATER_EQUAL = 3, - EQUAL = 4, - NOT_EQUAL = 5, -} CompareType; - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -template -inline uint32x4_t vcmpq_f32(const float32x4_t x, const float32x4_t y) { - return vcleq_f32(x, y); -} -#endif - -template -inline uint8_t Compare(const float x, const float y) { - return static_cast(x < y); -} - -template -inline uint8_t Compare(const int x, const int y) { - return static_cast(x == y); -} - -template -inline uint8_t Compare(const int64_t x, const int64_t y) { - return static_cast(x < y); -} - -template -struct CompareCompute { - void operator()(const Tensor *X, const Tensor *Y, const int Axis, - Tensor *Out) {} -}; - -template -struct CompareCompute { - void operator()(const Tensor *X, const Tensor *Y, const int Axis, - Tensor *Out) { - const float *x = X->data(); - const float *y = Y->data(); - uint8_t *output = reinterpret_cast(Out->mutable_data()); - const auto &x_dims = X->dims(); - const auto &y_dims = Y->dims(); - /// axis = -1 represent the last dimensions. - int axis = (Axis == -1 ? 
x_dims.size() - y_dims.size() : Axis); - int batch = 1; - int channels = 1; - int elementwise_num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - elementwise_num *= x_dims[i]; - } - // if elementwise_num == 1, compare rowwise - if (elementwise_num == 1) { - int remain_start = 0; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - remain_start = channels & 0xfffffff8; - uint8x8_t __mask = vdup_n_u8(0x1); - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels - 7; j += 8) { - int x_offset = i * channels + j; - float32x4_t __x0 = vld1q_f32(x + x_offset); - float32x4_t __x1 = vld1q_f32(x + x_offset + 4); - float32x4_t __y0 = vld1q_f32(y + j); - float32x4_t __y1 = vld1q_f32(y + j + 4); - uint32x4_t __cmp0 = vcmpq_f32(__x0, __y0); - uint32x4_t __cmp1 = vcmpq_f32(__x1, __y1); - uint16x4_t __ncmp0 = vmovn_u32(__cmp0); - uint16x4_t __ncmp1 = vmovn_u32(__cmp1); - uint16x8_t __ncmp = vcombine_u16(__ncmp0, __ncmp1); - uint8x8_t __nncmp = vmovn_u16(__ncmp); - __nncmp = vand_u8(__nncmp, __mask); - vst1_u8(output + x_offset, __nncmp); - } - } -#endif // __ARM_NEON__ - for (int i = 0; i < batch; ++i) { - for (int j = remain_start; j < channels; ++j) { - int x_offset = i * channels + j; - output[x_offset] = Compare(x[x_offset], y[j]); - } - } - } else { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int x_offset = (i * channels + j) * elementwise_num; - int y_offset = j * elementwise_num; - int remain_start = 0; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - remain_start = elementwise_num & 0xfffffff8; - uint8x8_t __mask = vdup_n_u8(0x1); - for (int k = 0; k < elementwise_num - 7; k += 8) { - float32x4_t __x0 = vld1q_f32(x + x_offset); - float32x4_t __x1 = vld1q_f32(x + x_offset + 4); - float32x4_t __y0 = vld1q_f32(y + y_offset); - uint32x4_t __cmp0 = vcmpq_f32(__x0, __y0); - 
uint32x4_t __cmp1 = vcmpq_f32(__x1, __y0); - uint16x4_t __ncmp0 = vmovn_u32(__cmp0); - uint16x4_t __ncmp1 = vmovn_u32(__cmp1); - uint16x8_t __ncmp = vcombine_u16(__ncmp0, __ncmp1); - uint8x8_t __nncmp = vmovn_u16(__ncmp); - __nncmp = vand_u8(__nncmp, __mask); - vst1_u8(output + x_offset, __nncmp); - x_offset += 8; - y_offset += 8; - } -#endif // __ARM_NEON__ - for (int k = remain_start; k < elementwise_num; ++k) { - output[x_offset + k] = Compare(x[x_offset + k], y[y_offset]); - } - } - } - } - } -}; - -template -struct CompareCompute { - void operator()(const Tensor *X, const Tensor *Y, const int Axis, - Tensor *Out) { - const int64_t *x = X->data(); - const int64_t *y = Y->data(); - uint8_t *output = reinterpret_cast(Out->mutable_data()); - const auto &x_dims = X->dims(); - const auto &y_dims = Y->dims(); - /// axis = -1 represent the last dimensions. - int axis = (Axis == -1 ? x_dims.size() - y_dims.size() : Axis); - int batch = 1; - int channels = 1; - int elementwise_num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - elementwise_num *= x_dims[i]; - } - // if elementwise_num == 1, compare rowwise - if (elementwise_num == 1) { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int x_offset = i * channels + j; - output[x_offset] = Compare(x[x_offset], y[j]); - } - } - } else { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int x_offset = (i * channels + j) * elementwise_num; - int y_offset = j * elementwise_num; - for (int k = 0; k < elementwise_num; ++k) { - output[x_offset + k] = Compare(x[x_offset + k], y[y_offset]); - } - } - } - } - } -}; - -template -struct CompareCompute { - void operator()(const Tensor *X, const Tensor *Y, const int Axis, - Tensor *Out) { - const int *x = X->data(); - const int *y = Y->data(); - uint8_t *output = 
reinterpret_cast(Out->mutable_data()); - const auto &x_dims = X->dims(); - const auto &y_dims = Y->dims(); - /// axis = -1 represent the last dimensions. - int axis = (Axis == -1 ? x_dims.size() - y_dims.size() : Axis); - int batch = 1; - int channels = 1; - int elementwise_num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - elementwise_num *= x_dims[i]; - } - // if elementwise_num == 1, compare rowwise - if (elementwise_num == 1) { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int x_offset = i * channels + j; - output[x_offset] = Compare(x[x_offset], y[j]); - } - } - } else { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int x_offset = (i * channels + j) * elementwise_num; - int y_offset = j * elementwise_num; - for (int k = 0; k < elementwise_num; ++k) { - output[x_offset + k] = Compare(x[x_offset + k], y[y_offset]); - } - } - } - } - } -}; - -#ifdef LESS_THAN_OP -template <> -bool LessThanKernel::Init(CompareParam *param) { - return true; -} - -template <> -void LessThanKernel::Compute(const CompareParam ¶m) { - if (param.input_x_->type() == type_id().hash_code()) { - CompareCompute()(param.input_x_, param.input_y_, - param.axis_, param.output_); - } else if (param.input_x_->type() == type_id().hash_code()) { - CompareCompute()(param.input_x_, param.input_y_, - param.axis_, param.output_); - } else { - PADDLE_MOBILE_THROW_EXCEPTION( - "LessThan only support int64_t and float data type."); - } -} -#endif // LESS_THAN_OP - -#ifdef EQUAL_OP -template <> -bool EqualKernel::Init(CompareParam *param) { - return true; -} - -template <> -void EqualKernel::Compute(const CompareParam ¶m) { - if (param.input_x_->type() == type_id().hash_code()) { - CompareCompute()(param.input_x_, param.input_y_, param.axis_, - param.output_); - } -} -#endif // EQUAL_OP - -} 
// namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/concat_kernel.cpp b/mobile/src/operators/kernel/arm/concat_kernel.cpp deleted file mode 100644 index 3e585ec721..0000000000 --- a/mobile/src/operators/kernel/arm/concat_kernel.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONCAT_OP - -#include "operators/kernel/concat_kernel.h" -#include "operators/kernel/central-arm-func/concat_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConcatKernel::Init(ConcatParam *param) { - return true; -} - -template <> -void ConcatKernel::Compute(const ConcatParam ¶m) { - if (param.Inputs()[0]->type() == type_id().hash_code()) { - ConcatCompute(param); - } else { - ConcatCompute(param); - } - param.Out()->set_lod(param.Inputs()[0]->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/conditional_block_kernel.cpp b/mobile/src/operators/kernel/arm/conditional_block_kernel.cpp deleted file mode 100644 index a5530559d1..0000000000 --- a/mobile/src/operators/kernel/arm/conditional_block_kernel.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONDITIONAL_BLOCK_OP - -#include "operators/kernel/conditional_block_kernel.h" -#include -#include -#include -#include "framework/data_type.h" - -namespace paddle_mobile { -namespace operators { - -class StepExecutor { - typedef std::shared_ptr> OperatorPtr; - - public: - StepExecutor(const framework::BlockDesc *block, framework::Scope *scope) - : scope_(scope) { - std::vector> ops = block->Ops(); - ops_of_block_.resize(ops.size()); - for (int i = 0; i < ops.size(); ++i) { - std::shared_ptr op_desc = ops[i]; - DLOG << "conditional block create op: " << ops.size() << "," - << op_desc->Type(); - auto op_handler = framework::OpRegistry::CreateOp( - op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(), - op_desc->GetAttrMap(), scope_); - op_handler->Init(); - ops_of_block_[i] = op_handler; - } - } - - void Run() { - for (int i = 0; i < ops_of_block_.size(); ++i) { - auto &op_handler = ops_of_block_[i]; - DLOG << "conditional block op InferShape: " << i - << "th: " << op_handler->Type(); - op_handler->InferShape(); - DLOG << "conditional block op Run: " << i << "th: " << op_handler->Type(); - op_handler->Run(); - } - } - - private: - framework::Scope *scope_; - std::vector ops_of_block_; -}; - -template <> -bool ConditionalBlockKernel::Init( - ConditionalBlockParam *param) { - return true; -} - -template <> -void ConditionalBlockKernel::Compute( - const ConditionalBlockParam ¶m) { - bool need_run; - if (param.isScalarCondition()) { - auto xs = param.Cond(); - PADDLE_MOBILE_ENFORCE( - xs[0]->type() == type_id().hash_code() && xs[0]->numel() == 1, - 
"condition input's data type should be bool, " - "numel should be 1, actual numel is %d", - xs[0]->numel()); - need_run = xs[0]->data()[0]; - } else { - auto xs = param.Input(); - need_run = std::all_of( - xs.begin(), xs.end(), - [](const framework::LoDTensor *t) { return t->numel() != 0; }); - } - - if (need_run) { - auto input = param.Input(); - auto sub = param.getSubBlock(); - auto ¤t_scope = param.GetScope()->NewScope(); - StepExecutor executor(sub, ¤t_scope); - executor.Run(); - param.GetScope()->DeleteScope(¤t_scope); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // CONDITIONAL_BLOCK_OP diff --git a/mobile/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp deleted file mode 100644 index 229b96b550..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADDBNRELU_OP - -#include "operators/kernel/conv_add_bn_relu_kernel.h" -#include -#include "framework/context.h" -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include "operators/math/element_wise.h" -#include "operators/math/gemm/gemm1x1s1.h" -#include "operators/math/slidingwindow_utils.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddBNReluKernel::Init( - FusionConvAddBNReluParam *param) { - const Tensor *mean = param->InputMean(); - const Tensor *variance = param->InputVariance(); - const Tensor *scale = param->InputScale(); - const Tensor *bias = param->InputBias(); - const Tensor *bias1 = param->Bias(); - const float epsilon = param->Epsilon(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - auto bias1_ptr = bias1->data(); - - const int C = mean->numel(); - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - - Variable *scale_var = param->GetScope()->Var(); - Variable *bias_var = param->GetScope()->Var(); - LoDTensor *new_scale = scale_var->GetMutable(); - LoDTensor *new_bias = bias_var->GetMutable(); - float *new_scale_ptr = new_scale->mutable_data({C}); - float *new_bias_ptr = new_bias->mutable_data({C}); - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] + (bias1_ptr[i] - mean_ptr[i]) * - inv_std_ptr[i] * scale_ptr[i]; - } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - InitBaseConvKernel(param); - - // try to use faster depthwise conv - switch (param->ExecMode()) { - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - use_slidingwindow_add_bn_relu = true; - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - 
use_gemm_add_bn_relu = true; - break; - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - const std::vector &paddings = param->Paddings(); - const std::vector &strides = param->Strides(); - if (paddings.size() == 2 && paddings[0] == paddings[1] && - strides.size() == 2 && strides[0] == strides[1]) { - int pad = paddings[0]; - int stride = strides[0]; - const int win = param->Input()->dims()[3]; - if (pad == 1) { - if (stride == 1) { - could_use_faster_depthwise_conv_ = true; - } else if (stride == 2 && win > 7) { - could_use_faster_depthwise_conv_ = true; - } - } - } - break; - } - - if (could_use_faster_depthwise_conv_ || use_gemm_add_bn_relu || - use_slidingwindow_add_bn_relu) { - auto filter_data = param->Filter()->data(); - auto filter_dim = param->Filter()->dims(); - int len = 1; - for (int i = 0; i < filter_dim.size(); i++) { - len *= filter_dim[i]; - } - int batch = filter_dim[0]; - int step = len / batch; - for (int i = 0; i < batch; i++) { - for (int k = 0; k < step; k++) { - filter_data[i * step + k] = - filter_data[i * step + k] * new_scale_ptr[i]; - } - } - if (use_gemm_add_bn_relu) { - ARMArch arch = framework::CPUContext::Context()->get_arch(); - math::gemm1x1s1_transform_weight(*param->Filter(), *param->Output(), - param->transformed_filter_, - param->groups, arch); - } - if (use_slidingwindow_add_bn_relu) { - math::slidingwindow_transform_weight(*param->Filter(), - param->transformed_filter_); - } - } - - return true; -} - -template <> -void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) { - bool fusion_has_been_computed = false; - switch (param.ExecMode()) { - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - if (could_use_faster_depthwise_conv_) { - FasterDepthwiseConv3x3_bias_relu(param, param.NewBias()->data(), - true); - fusion_has_been_computed = true; - } else { - DepthwiseConv3x3(param); - } - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: 
- DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - fusion_has_been_computed = true; - GemmConv1x1s1(param, param.NewBias()->data(), true, - true); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - SlidingwindowConv3x3(param, param.NewBias()->data(), - true, true); - fusion_has_been_computed = true; - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } - if (!fusion_has_been_computed) { - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); - } -} - -template class ConvAddBNReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/conv_add_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_add_kernel.cpp deleted file mode 100644 index 66ed513ac9..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_add_kernel.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP - -#include "operators/kernel/conv_add_kernel.h" -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include "operators/math/element_wise.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddKernel::Init(FusionConvAddParam *param) { - InitBaseConvKernel(param); - return true; -} - -template <> -void ConvAddKernel::Compute(const FusionConvAddParam ¶m) { - bool fusion_has_been_computed = false; - switch (param.ExecMode()) { - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: - DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - fusion_has_been_computed = true; - GemmConv1x1s1(param, param.Bias()->data(), true, - false); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - SlidingwindowConv3x3(param, param.Bias()->data(), - true, false); - fusion_has_been_computed = true; - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } - if (!fusion_has_been_computed) { - if (param.Bias()->dims() == param.Output()->dims()) { - math::AddElememtWise(param.Output(), param.Bias(), param.Axis(), - param.Output()); - } else { - math::AddChannelWise(param.Output(), param.Bias(), - param.Output()); - } - } -} - -template class ConvAddKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp deleted file mode 100644 index 54eb2ca23b..0000000000 --- 
a/mobile/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDRELU_OP - -#include "operators/kernel/conv_add_relu_kernel.h" -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include "operators/math/element_wise.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { - InitBaseConvKernel(param); - return true; -} - -template <> -void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) { - bool fusion_has_been_computed = false; - switch (param.ExecMode()) { - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: - DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - fusion_has_been_computed = true; - GemmConv1x1s1(param, param.Bias()->data(), true, - true); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - SlidingwindowConv3x3(param, nullptr, false, false); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid 
convolution execute mode %d", - param.ExecMode()); - } - if (!fusion_has_been_computed) { - if (param.Bias()->dims() == param.Output()->dims()) { - math::AddElememtWise(param.Output(), param.Bias(), param.Axis(), - param.Output()); - } else { - math::AddChannelWise(param.Output(), param.Bias(), param.Output()); - } - } -} - -template class ConvAddReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp deleted file mode 100644 index 138e34d78e..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBNADDRELU_OP - -#include "operators/kernel/conv_bn_add_relu_kernel.h" -#include -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include "operators/math/element_wise.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNAddReluKernel::Init( - FusionConvBNAddReluParam *param) { - const Tensor *mean = param->InputMean(); - const Tensor *variance = param->InputVariance(); - const Tensor *scale = param->InputScale(); - const Tensor *bias = param->InputBias(); - const float epsilon = param->Epsilon(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = const_cast(scale->data()); - auto bias_ptr = const_cast(bias->data()); - - for (int c = 0; c < scale->numel(); ++c) { - float inv_scale = 1.f / (pow(variance_ptr[c] + epsilon, 0.5)); - bias_ptr[c] -= inv_scale * scale_ptr[c] * mean_ptr[c]; - scale_ptr[c] *= inv_scale; - } - - InitBaseConvKernel(param); - return true; -} - -template <> -void ConvBNAddReluKernel::Compute( - const FusionConvBNAddReluParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: - DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - GemmConv1x1s1(param, nullptr, false, false); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - SlidingwindowConv3x3(param, nullptr, false, false); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } - - if (param.Bias()->dims() == param.Output()->dims()) { - math::ScaleAddChannelWise(param.Output(), param.InputScale(), 
- param.InputBias(), param.Bias(), - param.Output()); - } else { - math::ScaleAddChannelWise(param.Output(), param.InputScale(), - param.InputBias(), param.Output()); - math::AddElememtWise(param.Output(), param.Bias(), param.Axis(), - param.Output()); - } -} - -template class ConvBNAddReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp deleted file mode 100644 index f217902bf2..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBNRELU_OP - -#include "operators/kernel/conv_bn_relu_kernel.h" -#include -#include "framework/context.h" -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include "operators/math/element_wise.h" -#include "operators/math/gemm/gemm1x1s1.h" -#include "operators/math/slidingwindow_utils.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { - const Tensor *mean = param->InputMean(); - const Tensor *variance = param->InputVariance(); - const Tensor *scale = param->InputScale(); - const Tensor *bias = param->InputBias(); - const float epsilon = param->Epsilon(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - const int C = mean->numel(); - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - - Variable *scale_var = param->GetScope()->Var(); - Variable *bias_var = param->GetScope()->Var(); - LoDTensor *new_scale = scale_var->GetMutable(); - LoDTensor *new_bias = bias_var->GetMutable(); - float *new_scale_ptr = new_scale->mutable_data({C}); - float *new_bias_ptr = new_bias->mutable_data({C}); - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - InitBaseConvKernel(param); - - switch (param->ExecMode()) { - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - use_slidingwindow_bn_relu = true; - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - use_gemm_bn_relu = true; - break; - } - - if (use_gemm_bn_relu || use_slidingwindow_bn_relu) { - auto filter_data = param->Filter()->data(); - auto filter_dim = 
param->Filter()->dims(); - int len = 1; - for (int i = 0; i < filter_dim.size(); i++) { - len *= filter_dim[i]; - } - int batch = filter_dim[0]; - int step = len / batch; - for (int i = 0; i < batch; i++) { - for (int k = 0; k < step; k++) { - filter_data[i * step + k] = - filter_data[i * step + k] * new_scale_ptr[i]; - } - } - if (use_gemm_bn_relu) { - ARMArch arch = framework::CPUContext::Context()->get_arch(); - math::gemm1x1s1_transform_weight(*param->Filter(), *param->Output(), - param->transformed_filter_, - param->groups, arch); - } - if (use_slidingwindow_bn_relu) { - math::slidingwindow_transform_weight(*param->Filter(), - param->transformed_filter_); - } - } - return true; -} - -template <> -void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) { - bool fusion_has_been_computed = false; - switch (param.ExecMode()) { - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: - DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - GemmConv1x1s1(param, param.NewBias()->data(), true, - true); - fusion_has_been_computed = true; - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - SlidingwindowConv3x3(param, param.NewBias()->data(), - true, true); - fusion_has_been_computed = true; - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } - if (!fusion_has_been_computed) { - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); - } -} -template class ConvBNReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/conv_common.cpp 
b/mobile/src/operators/kernel/arm/convolution/conv_common.cpp deleted file mode 100644 index dd3843afef..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_common.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#include "operators/kernel/arm/convolution/conv_common.h" -#include "framework/context.h" -#include "operators/math/gemm/gemm1x1s1.h" -#include "operators/math/slidingwindow_utils.h" -#include "operators/math/winograd/winograd_transform.h" - -namespace paddle_mobile { -namespace operators { - -void InitBaseConvKernel(ConvParam *param) { - bool conv1x1 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Filter()->dims()[2] == 1; - bool conv3x3 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Filter()->dims()[2] == 3; - bool conv5x5 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Filter()->dims()[2] == 5; - bool depth3x3 = conv3x3 && param->Groups() == param->Input()->dims()[1] && - param->Input()->dims()[1] == param->Output()->dims()[1]; - - bool depth5x5 = conv5x5 && param->Groups() == param->Input()->dims()[1] && - param->Input()->dims()[1] == param->Output()->dims()[1]; - - if (param->Filter()->type() == type_id().hash_code()) { -#ifndef __aarch64__ - if (depth3x3 && param->Strides()[0] < 3 && - param->Strides()[0] == param->Strides()[1]) { - param->ExecMode() = 
ConvParam::EXEC_DEPTHWISE3x3_INT8; - } else if (depth5x5 && param->Strides()[0] < 2 && - param->Strides()[0] == param->Strides()[1]) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE5x5_INT8; - } else { -#endif // __aarch64__ - param->ExecMode() = ConvParam::EXEC_GEMM_INT8; -#ifndef __aarch64__ - } -#endif // __aarch64__ - } else { - if (depth3x3 && param->Strides()[0] == param->Strides()[1] && - param->Strides()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT; - } else if (depth3x3 && param->Strides()[0] == param->Strides()[1] && - param->Strides()[0] == 2) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT; - } else if (depth5x5 && param->Strides()[0] == param->Strides()[1] && - param->Strides()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE5x5_FLOAT; - } else if (conv3x3 && param->Groups() == 1 && - param->Strides()[0] == param->Strides()[1] && - param->Dilations()[0] == param->Dilations()[1] && - param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - // transform weight - Variable *transformed_var = param->GetScope()->Var(); - param->transformed_filter_ = - transformed_var->GetMutable(); - if (param->Input()->dims()[1] >= 32 && param->Output()->dims()[1] >= 32 && - param->Output()->dims()[2] > 16 && param->Output()->dims()[3] > 16) { - math::winograd_transform_weight<8, 3>(*param->Filter(), - param->transformed_filter_); - param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - } else { - math::slidingwindow_transform_weight(*param->Filter(), - param->transformed_filter_); - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT; - } - } else if (conv3x3 && param->Groups() == 1 && - param->Strides()[0] == param->Strides()[1] && - param->Dilations()[0] == param->Dilations()[1] && - param->Strides()[0] == 2 && param->Dilations()[0] == 1) { - // transform weight - Variable *transformed_var = param->GetScope()->Var(); - param->transformed_filter_ = - transformed_var->GetMutable(); - 
math::slidingwindow_transform_weight(*param->Filter(), - param->transformed_filter_); - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT; - } else if (conv1x1 && param->Groups() == 1 && - param->Paddings()[0] == param->Paddings()[1] && - param->Paddings()[0] == 0 && param->Input()->dims()[1] > 1 && - param->Strides()[0] == param->Strides()[1] && - param->Dilations()[0] == param->Dilations()[1] && - param->Strides()[0] == 1 && param->Dilations()[0] == 1 && - param->Output()->dims()[2] * param->Output()->dims()[3] > 1) { - // transform weight - Variable *transformed_var = param->GetScope()->Var(); - ARMArch arch = framework::CPUContext::Context()->get_arch(); - param->transformed_filter_ = - transformed_var->GetMutable(); - math::gemm1x1s1_transform_weight(*param->Filter(), *param->Output(), - param->transformed_filter_, - param->groups, arch); - param->ExecMode() = ConvParam::EXEC_GEMM1x1s1_FLOAT; - } else { - param->ExecMode() = ConvParam::EXEC_GEMM_FLOAT; - } - } -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/conv_common.h b/mobile/src/operators/kernel/arm/convolution/conv_common.h deleted file mode 100644 index 4db37715c4..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_common.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -void InitBaseConvKernel(ConvParam *param); - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/convolution/conv_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_kernel.cpp deleted file mode 100644 index f5dc35cdf6..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_kernel.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_OP - -#include "operators/kernel/conv_kernel.h" -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvKernel::Init(ConvParam *param) { - InitBaseConvKernel(param); - return true; -} - -template <> -void ConvKernel::Compute(const ConvParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_GEMM_INT8: - GemmConv(param); - break; -#ifndef __aarch64__ - case ConvParam::EXEC_DEPTHWISE3x3_INT8: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_INT8: - DepthwiseConv5x5(param); - break; -#endif // __aarch64__ - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: - DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - GemmConv1x1s1(param, nullptr, false, false); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - SlidingwindowConv3x3(param, nullptr, false, false); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} - -template class ConvKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/conv_relu_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_relu_kernel.cpp deleted file mode 100644 index 477bd55e55..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_relu_kernel.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVRELU_OP - -#include "operators/kernel/conv_relu_kernel.h" -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/activation_arm_func.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvReluKernel::Init(FusionConvReluParam *param) { - InitBaseConvKernel(param); - return true; -} - -template <> -void ConvReluKernel::Compute( - const FusionConvReluParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: - DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - GemmConv1x1s1(param, nullptr, false, false); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3S2_FLOAT: - SlidingwindowConv3x3(param, nullptr, false, false); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } - ActivationCompute()(param.Output(), param.Output()); -} -template class ConvReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git 
a/mobile/src/operators/kernel/arm/convolution/conv_transpose_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/conv_transpose_kernel.cpp deleted file mode 100644 index 771a846ed6..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/conv_transpose_kernel.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_TRANSPOSE_OP - -#include "operators/kernel/conv_transpose_kernel.h" -#include "operators/kernel/central-arm-func/conv_transpose_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvTransposeKernel::Init(ConvTransposeParam *param) { - return true; -} - -template <> -void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) { - ConvTransposeCompute(param); -} - -template class ConvTransposeKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp deleted file mode 100644 index 0eefeae1d1..0000000000 --- a/mobile/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DWCONVBNRELU_OP - -#include "operators/kernel/dwconv_bn_relu_kernel.h" -#include -#include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include "operators/math/element_wise.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DWConvBNReluKernel::Init(FusionDWConvBNReluParam *param) { - const Tensor *mean = param->InputMean(); - const Tensor *variance = param->InputVariance(); - const Tensor *scale = param->InputScale(); - const Tensor *bias = param->InputBias(); - const float epsilon = param->Epsilon(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - const int C = mean->numel(); - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - Variable *scale_var = param->GetScope()->Var(); - Variable *bias_var = param->GetScope()->Var(); - LoDTensor *new_scale = scale_var->GetMutable(); - LoDTensor *new_bias = bias_var->GetMutable(); - float *new_scale_ptr = new_scale->mutable_data({C}); - float *new_bias_ptr = new_bias->mutable_data({C}); - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - InitBaseConvKernel(param); - return true; -} - -template <> -void DWConvBNReluKernel::Compute( - const FusionDWConvBNReluParam 
¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - DepthwiseConv3x3(param); - break; - case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: - DepthwiseConv5x5(param); - break; - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<8, 3>(param); - break; - case ConvParam::EXEC_GEMM_FLOAT: - GemmConv(param); - break; - case ConvParam::EXEC_GEMM1x1s1_FLOAT: - GemmConv1x1s1(param, nullptr, false, false); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); -} - -template class DWConvBNReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/crf_kernel.cpp b/mobile/src/operators/kernel/arm/crf_kernel.cpp deleted file mode 100644 index d30c28b357..0000000000 --- a/mobile/src/operators/kernel/arm/crf_kernel.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CRF_OP - -#include "operators/kernel/crf_kernel.h" -#include "common/types.h" -#include "operators/kernel/central-arm-func/crf_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool CrfKernel::Init(CrfParam *param) { - return true; -} - -template <> -void CrfKernel::Compute(const CrfParam ¶m) { - CrfCompute(param); -} - -template class CrfKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/density_prior_box_kernel.cpp b/mobile/src/operators/kernel/arm/density_prior_box_kernel.cpp deleted file mode 100644 index 8aff3984e8..0000000000 --- a/mobile/src/operators/kernel/arm/density_prior_box_kernel.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DENSITY_PRIORBOX_OP - -#include "operators/kernel/central-arm-func/density_prior_box_arm_func.h" -#include "operators/kernel/prior_box_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DensityPriorBoxKernel::Init(DensityPriorBoxParam *param) { - return true; -} - -template <> -void DensityPriorBoxKernel::Compute( - const DensityPriorBoxParam ¶m) { - DensityPriorBoxCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // DENSITY_PRIORBOX_OP diff --git a/mobile/src/operators/kernel/arm/dequantize_bn_kernel.cpp b/mobile/src/operators/kernel/arm/dequantize_bn_kernel.cpp deleted file mode 100644 index 4fa00f3a37..0000000000 --- a/mobile/src/operators/kernel/arm/dequantize_bn_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 201f8 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "operators/kernel/dequant_bn_kernel.h" -#include "operators/math/activation.h" -#include "operators/math/quantize.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { - -#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \ - defined(FUSION_DEQUANT_BN_RELU_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) -void PublicFusionDequantBNInitParam(FusionDequantBNParam *param, - const framework::Tensor *bias) { - // batch norm params - const Tensor *bn_mean = param->bn_mean_; - const Tensor *bn_variance = param->bn_variance_; - Tensor *bn_scale = param->bn_scale_; - Tensor *bn_bias = param->bn_bias_; - const float epsilon = param->epsilon_; - - const float *mean_ptr = bn_mean->data(); - const float *var_ptr = bn_variance->data(); - float *bn_scale_ptr = bn_scale->mutable_data(); - float *bn_bias_ptr = bn_bias->mutable_data(); - for (int c = 0; c < bn_scale->numel(); ++c) { - float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon)); - float val = bias ? 
bias->data()[c] : 0; - bn_bias_ptr[c] = - inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c]; - bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c]; - } -} -#endif - -#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \ - defined(FUSION_DEQUANT_BN_RELU_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_RELU_OP) -template -void DequantBNCompute(const FusionDequantBNParam *param) { - const int32_t *input = param->input_->data(); - const float *bn_scale = param->bn_scale_->data(); - const float *bn_bias = param->bn_bias_->data(); - // dequantize params - const float activation_scale = param->activation_scale_->data()[0]; - const float weight_scale = param->weight_scale_; - const float dequant_scale = activation_scale / weight_scale; - - float *output = param->output_->mutable_data(); - int batch_size = param->input_->dims()[0]; - int channels = param->input_->dims()[1]; - size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3]; - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < batch_size; ++batch) { - for (int c = 0; c < channels; ++c) { - // not fuse bn and dequant scale to minimize precision difference - // float scale = bn_scale[c] * dequant_scale; - float scale = bn_scale[c]; - float bias = bn_bias[c]; - size_t offset = (batch * channels + c) * spatial_size; - const int32_t *x = input + offset; - float *y = output + offset; - size_t remain = spatial_size; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - int loop = spatial_size >> 4; - remain = spatial_size & 0xF; - float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale); - float32x4_t __scale = vdupq_n_f32(scale); - float32x4_t __bias = vdupq_n_f32(bias); - for (int k = 0; k < loop; ++k, x += 16, y += 16) { - int32x4_t r0 = vld1q_s32(x); - int32x4_t r1 = vld1q_s32(x + 4); - int32x4_t r2 = vld1q_s32(x + 8); - int32x4_t r3 = vld1q_s32(x + 12); - float32x4_t f0 = vcvtq_f32_s32(r0); - float32x4_t f1 = vcvtq_f32_s32(r1); - float32x4_t f2 = 
vcvtq_f32_s32(r2); - float32x4_t f3 = vcvtq_f32_s32(r3); - f0 = vmulq_f32(__dequant_scale, f0); - f1 = vmulq_f32(__dequant_scale, f1); - f2 = vmulq_f32(__dequant_scale, f2); - f3 = vmulq_f32(__dequant_scale, f3); - f0 = vmlaq_f32(__bias, __scale, f0); - f1 = vmlaq_f32(__bias, __scale, f1); - f2 = vmlaq_f32(__bias, __scale, f2); - f3 = vmlaq_f32(__bias, __scale, f3); - f0 = math::vActiveq_f32(f0); - f1 = math::vActiveq_f32(f1); - f2 = math::vActiveq_f32(f2); - f3 = math::vActiveq_f32(f3); - vst1q_f32(y, f0); - vst1q_f32(y + 4, f1); - vst1q_f32(y + 8, f2); - vst1q_f32(y + 12, f3); - } -#endif // __ARM_NEON__ - for (int k = 0; k < remain; ++k) { - y[k] = math::Active(scale * (dequant_scale * x[k]) + bias); - } - } - } -} -#endif - -#ifdef FUSION_DEQUANT_BN_OP -template <> -bool FusionDequantBNKernel::Init(FusionDequantBNParam *param) { - PublicFusionDequantBNInitParam(param, nullptr); - return true; -} - -template <> -void FusionDequantBNKernel::Compute( - const FusionDequantBNParam ¶m) { - DequantBNCompute(¶m); -} -#endif // FUSION_DEQUANT_BN_OP - -#ifdef FUSION_DEQUANT_BN_RELU_OP -template <> -bool FusionDequantBNReluKernel::Init( - FusionDequantBNParam *param) { - PublicFusionDequantBNInitParam(param, nullptr); - return true; -} - -template <> -void FusionDequantBNReluKernel::Compute( - const FusionDequantBNParam ¶m) { - DequantBNCompute(¶m); -} -#endif // FUSION_DEQUANT_BN_RELU_OP - -#ifdef FUSION_DEQUANT_ADD_BN_OP -template <> -bool FusionDequantAddBNKernel::Init( - FusionDequantAddBNParam *param) { - const framework::Tensor *bias = param->bias_; - PublicFusionDequantBNInitParam(param, bias); - return true; -} - -template <> -void FusionDequantAddBNKernel::Compute( - const FusionDequantAddBNParam ¶m) { - DequantBNCompute(¶m); -} -#endif // FUSION_DEQUANT_ADD_BN_OP - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP -template <> -bool FusionDequantAddBNReluKernel::Init( - FusionDequantAddBNParam *param) { - const framework::Tensor *bias = param->bias_; - 
PublicFusionDequantBNInitParam(param, bias); - return true; -} - -template <> -void FusionDequantAddBNReluKernel::Compute( - const FusionDequantAddBNParam ¶m) { - DequantBNCompute(¶m); -} -#endif // FUSION_DEQUANT_ADD_BN_RELU_OP - -#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) -template -void DequantBNQuantCompute(const FusionDequantAddBNQuantParam *param) { - const int32_t *input = param->input_->data(); - const float *bn_scale = param->bn_scale_->data(); - const float *bn_bias = param->bn_bias_->data(); - // dequantize params - const float activation_scale = param->activation_scale_->data()[0]; - const float weight_scale = param->weight_scale_; - const float dequant_scale = activation_scale / weight_scale; - // quantize params - Tensor *output_scale = param->online_scale_; - float max_abs = 0.f; - - int8_t *output = param->output_->mutable_data(); - int batch_size = param->input_->dims()[0]; - int channels = param->input_->dims()[1]; - size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3]; - - // if (param->is_static_) { - if (true) { - max_abs = param->static_scale_; - float quant_scale = 127.f / max_abs; - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < batch_size; ++batch) { - for (int c = 0; c < channels; ++c) { - // not fuse bn and dequant scale to minimize precision difference - // float scale = bn_scale[c] * dequant_scale; - float scale = bn_scale[c]; - float bias = bn_bias[c]; - size_t offset = (batch * channels + c) * spatial_size; - const int32_t *x = input + offset; - int8_t *y = output + offset; - size_t remain = spatial_size; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - int loop = spatial_size >> 4; - remain = spatial_size & 0xF; - float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale); - float32x4_t __scale = vdupq_n_f32(scale); - float32x4_t __bias = vdupq_n_f32(bias); - float32x4_t __quant_scale = vdupq_n_f32(quant_scale); - for (int k = 0; k < loop; 
++k, x += 16, y += 16) { - int32x4_t r0 = vld1q_s32(x); - int32x4_t r1 = vld1q_s32(x + 4); - int32x4_t r2 = vld1q_s32(x + 8); - int32x4_t r3 = vld1q_s32(x + 12); - float32x4_t f0 = vcvtq_f32_s32(r0); - float32x4_t f1 = vcvtq_f32_s32(r1); - float32x4_t f2 = vcvtq_f32_s32(r2); - float32x4_t f3 = vcvtq_f32_s32(r3); - f0 = vmulq_f32(__dequant_scale, f0); - f1 = vmulq_f32(__dequant_scale, f1); - f2 = vmulq_f32(__dequant_scale, f2); - f3 = vmulq_f32(__dequant_scale, f3); - f0 = vmlaq_f32(__bias, __scale, f0); - f1 = vmlaq_f32(__bias, __scale, f1); - f2 = vmlaq_f32(__bias, __scale, f2); - f3 = vmlaq_f32(__bias, __scale, f3); - f0 = math::vActiveq_f32(f0); - f1 = math::vActiveq_f32(f1); - f2 = math::vActiveq_f32(f2); - f3 = math::vActiveq_f32(f3); - f0 = vmulq_f32(__quant_scale, f0); - f1 = vmulq_f32(__quant_scale, f1); - f2 = vmulq_f32(__quant_scale, f2); - f3 = vmulq_f32(__quant_scale, f3); - int32x4_t q0 = math::vRoundq_f32(f0); - int32x4_t q1 = math::vRoundq_f32(f1); - int32x4_t q2 = math::vRoundq_f32(f2); - int32x4_t q3 = math::vRoundq_f32(f3); - int16x4_t d0 = vmovn_s32(q0); - int16x4_t d1 = vmovn_s32(q1); - int16x4_t d2 = vmovn_s32(q2); - int16x4_t d3 = vmovn_s32(q3); - int16x8_t q5 = vcombine_s16(d0, d1); - int16x8_t q6 = vcombine_s16(d2, d3); - int8x8_t d5 = vmovn_s16(q5); - int8x8_t d6 = vmovn_s16(q6); - vst1_s8(y, d5); - vst1_s8(y + 8, d6); - } -#endif // __ARM_NEON__ - for (int k = 0; k < remain; ++k) { - float x_temp = - math::Active(scale * (dequant_scale * x[k]) + bias); - y[k] = math::Round(x_temp * quant_scale); - } - } - } - } else { - // TODO(hjchen2) - max_abs = std::max(max_abs, 1e-6f); - } - param->online_scale_->mutable_data()[0] = max_abs; -} - -template <> -bool FusionDequantAddBNQuantKernel::Init( - FusionDequantAddBNQuantParam *param) { - const framework::Tensor *bias = param->bias_; - PublicFusionDequantBNInitParam(param, bias); - return true; -} - -template <> -void FusionDequantAddBNQuantKernel::Compute( - const FusionDequantAddBNQuantParam 
¶m) { - switch (param.round_type_) { - case ROUND_NEAREST_TO_EVEN: - DequantBNQuantCompute(¶m); - break; - case ROUND_NEAREST_TOWARDS_ZERO: - DequantBNQuantCompute(¶m); - break; - case ROUND_NEAREST_AWAY_ZERO: - DequantBNQuantCompute(¶m); - break; - default: - LOG(kLOG_ERROR) << "round type is not supported."; - break; - } -} -#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP -template <> -bool FusionDequantAddBNReluQuantKernel::Init( - FusionDequantAddBNQuantParam *param) { - const framework::Tensor *bias = param->bias_; - PublicFusionDequantBNInitParam(param, bias); - return true; -} - -template <> -void FusionDequantAddBNReluQuantKernel::Compute( - const FusionDequantAddBNQuantParam ¶m) { - switch (param.round_type_) { - case ROUND_NEAREST_TO_EVEN: - DequantBNQuantCompute(¶m); - break; - case ROUND_NEAREST_TOWARDS_ZERO: - DequantBNQuantCompute(¶m); - break; - case ROUND_NEAREST_AWAY_ZERO: - DequantBNQuantCompute(¶m); - break; - default: - LOG(kLOG_ERROR) << "round type is not supported."; - break; - } -} -#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/dequantize_kernel.cpp b/mobile/src/operators/kernel/arm/dequantize_kernel.cpp deleted file mode 100644 index 7c0d1cea18..0000000000 --- a/mobile/src/operators/kernel/arm/dequantize_kernel.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DEQUANT_OP - -#include "operators/kernel/dequantize_kernel.h" - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { - -template <> -bool DequantizeKernel::Init(DequantizeParam *param) { - return true; -} - -template <> -void DequantizeKernel::Compute(const DequantizeParam ¶m) { - const LoDTensor *input = param.input_; - LoDTensor *output = param.output_; - float activation_scale = param.activation_scale_->data()[0]; - float weight_scale = param.weight_scale_; - const int32_t *x = input->data(); - float *y = output->mutable_data(); - size_t size = output->numel(); - // float scale = 1.f / (activation_scale * weight_scale); - float scale = activation_scale / weight_scale; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = size >> 4; - size_t remain = size & 0xF; - float32x4_t s = vdupq_n_f32(scale); - - #pragma omp parallel for - for (size_t i = 0; i < loop; ++i) { - const int32_t *local_x = x + (i << 4); - float *local_y = y + (i << 4); - int32x4_t r0 = vld1q_s32(local_x); - int32x4_t r1 = vld1q_s32(local_x + 4); - int32x4_t r2 = vld1q_s32(local_x + 8); - int32x4_t r3 = vld1q_s32(local_x + 12); - float32x4_t f0 = vcvtq_f32_s32(r0); - float32x4_t f1 = vcvtq_f32_s32(r1); - float32x4_t f2 = vcvtq_f32_s32(r2); - float32x4_t f3 = vcvtq_f32_s32(r3); - f0 = vmulq_f32(f0, s); - f1 = vmulq_f32(f1, s); - f2 = vmulq_f32(f2, s); - f3 = vmulq_f32(f3, s); - vst1q_f32(local_y, f0); - vst1q_f32(local_y + 4, f1); - vst1q_f32(local_y + 8, f2); - vst1q_f32(local_y + 12, f3); - } - size = remain; - x += (loop << 4); - y += (loop << 4); -#endif - for (size_t i = 0; i < size; ++i) { - y[i] = x[i] * scale; - } - output->set_lod(input->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/dropout_kernel.cpp 
b/mobile/src/operators/kernel/arm/dropout_kernel.cpp deleted file mode 100644 index 964773ad69..0000000000 --- a/mobile/src/operators/kernel/arm/dropout_kernel.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DROPOUT_OP - -#include "operators/kernel/dropout_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool DropoutKernel::Init(DropoutParam *para) { - return true; -} - -template -struct DropoutFunctor { - explicit DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} - inline T operator()(T in) const { return (1 - dropout_pro_) * in; } - - private: - T dropout_pro_; -}; - -template <> -void DropoutKernel::Compute(const DropoutParam ¶m) { - const auto *input_x = param.InputX(); - auto *input_x_ptr = input_x->data(); - auto *out = param.Out(); - auto *out_ptr = out->mutable_data(); - const float dropoutProb = param.DropoutProb(); - DropoutFunctor func_(dropoutProb); - math::Transform trans; - trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/arm/elementwise_add_kernel.cpp deleted file mode 100644 index c4bcbf6f7e..0000000000 --- a/mobile/src/operators/kernel/arm/elementwise_add_kernel.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ELEMENTWISEADD_OP - -#include "operators/kernel/elementwise_add_kernel.h" -#include "operators/kernel/central-arm-func/elementwise_add_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { - return true; -} - -template <> -void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) { - if (param.InputX()->type() == type_id().hash_code()) { - ElementwiseAddCompute(param); - } else if (param.InputX()->type() == type_id().hash_code()) { - AddElememtWiseStruct()(param.InputX(), param.InputY(), - param.Axis(), param.Out()); - } - param.Out()->set_lod(param.InputX()->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/arm/elementwise_mul_kernel.cpp deleted file mode 100644 index 9c245707da..0000000000 --- a/mobile/src/operators/kernel/arm/elementwise_mul_kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ELEMENTWISEMUL_OP - -#include "operators/kernel/elementwise_mul_kernel.h" -#include "operators/kernel/central-arm-func/elementwise_mul_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { - return true; -} - -template <> -void ElementwiseMulKernel::Compute( - const ElementwiseMulParam ¶m) { - ElementwiseMulCompute(param); - param.Out()->set_lod(param.InputX()->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/elementwise_sub_kernel.cpp b/mobile/src/operators/kernel/arm/elementwise_sub_kernel.cpp deleted file mode 100644 index 30f607155c..0000000000 --- a/mobile/src/operators/kernel/arm/elementwise_sub_kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISESUB_OP - -#include "operators/kernel/elementwise_sub_kernel.h" -#include "operators/kernel/central-arm-func/elementwise_sub_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseSubKernel::Init(ElementwiseSubParam *param) { - return true; -} - -template <> -void ElementwiseSubKernel::Compute( - const ElementwiseSubParam ¶m) { - ElementwiseSubCompute(param); - param.Out()->set_lod(param.InputX()->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/exp_kernel.cpp b/mobile/src/operators/kernel/arm/exp_kernel.cpp deleted file mode 100644 index 0323a2b045..0000000000 --- a/mobile/src/operators/kernel/arm/exp_kernel.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// -// Created by hujie09 on 2019-07-16. 
-// - -#ifdef EXP_OP -#pragma once - -#include -#include -namespace paddle_mobile { -namespace operators { -template <> -bool EXPKernel::Init( - paddle_mobile::operators::EXPParam *param) { - return true; -} - -template <> -void EXPKernel::Compute( - const paddle_mobile::operators::EXPParam ¶m) { - const auto input_ = param.InputX(); - auto output = param.Out(); - float *output_data = output->mutable_data(); - const float *input_data = input_->data(); - for (int i = 0; i < output->numel(); ++i, output_data++, input_data++) { - *output_data = exp(*input_data); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // EXP_OP diff --git a/mobile/src/operators/kernel/arm/feed_kernel.cpp b/mobile/src/operators/kernel/arm/feed_kernel.cpp deleted file mode 100644 index 26ea2ac5f7..0000000000 --- a/mobile/src/operators/kernel/arm/feed_kernel.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/kernel/feed_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FeedKernel::Init(FeedParam *param) { - return true; -} - -template <> -void FeedKernel::Compute(const FeedParam ¶m) { - int col = param.Col(); - param.Out()->ShareDataWith(param.InputX()->at(col)); - param.Out()->set_lod(param.InputX()->at(col).lod()); -} - -template class FeedKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/fetch_kernel.cpp b/mobile/src/operators/kernel/arm/fetch_kernel.cpp deleted file mode 100644 index 8a97fa934b..0000000000 --- a/mobile/src/operators/kernel/arm/fetch_kernel.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/kernel/fetch_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FetchKernel::Init(FetchParam *param) { - return true; -} - -template <> -void FetchKernel::Compute(const FetchParam ¶m) { - int col = param.Col(); - param.Out()->at(col).ShareDataWith(*(param.InputX())); -} - -template class FetchKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/flatten_kernel.cpp b/mobile/src/operators/kernel/arm/flatten_kernel.cpp deleted file mode 100644 index 4d00e49454..0000000000 --- a/mobile/src/operators/kernel/arm/flatten_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FLATTEN_OP - -#include "operators/kernel/flatten_kernel.h" -#include "operators/kernel/central-arm-func/flatten_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FlattenKernel::Init(FlattenParam *param) { - return true; -} - -template <> -void FlattenKernel::Compute(const FlattenParam ¶m) { - FlattenCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/fusion_fc_kernel.cpp b/mobile/src/operators/kernel/arm/fusion_fc_kernel.cpp deleted file mode 100644 index 54ad5f788b..0000000000 --- a/mobile/src/operators/kernel/arm/fusion_fc_kernel.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_FC_OP - -#include "operators/kernel/fusion_fc_kernel.h" -#include "operators/kernel/central-arm-func/fusion_fc_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcKernel::Init(FusionFcParam *param) { - int M = (int)param->InputX()->dims()[0]; - if (M == 1) { - int r = param->InputY()->dims()[0]; - int c = param->InputY()->dims()[1]; - float *B = param->InputY()->data(); - framework::Tensor matrix_trans; - float *trans_b = matrix_trans.mutable_data({r, c}); - int index = 0; - for (int j = 0; j < c; j++) { - for (int i = 0; i < r; i++) { - trans_b[index++] = B[i * c + j]; - } - } - index = 0; - for (int j = 0; j < c; j++) { - for (int i = 0; i < r; i++) { - B[index] = trans_b[index]; - index++; - } - } - } - return true; -} - -template <> -void FusionFcKernel::Compute(const FusionFcParam ¶m) { - FusionFcCompute(param); - param.Out()->set_lod(param.InputX()->lod()); -} - -template class FusionFcKernel; - -#ifdef FUSION_FC_INT8_OP -template <> -bool FusionFcKernel::Init(FusionFcParam *param) { - return true; -} - -template <> -void FusionFcKernel::Compute(const FusionFcParam ¶m) { - FusionFcCompute(param); - param.Out()->set_lod(param.InputX()->lod()); -} - -template class FusionFcKernel; -#endif - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/gru_kernel.cpp b/mobile/src/operators/kernel/arm/gru_kernel.cpp deleted file mode 100644 index 15459c8251..0000000000 --- a/mobile/src/operators/kernel/arm/gru_kernel.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef GRU_OP - -#include "operators/kernel/gru_kernel.h" -#include "operators/kernel/central-arm-func/gru_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool GruKernel::Init(GruParam *param) { - return true; -} - -template <> -void GruKernel::Compute(const GruParam ¶m) { - GruCompute(param); - param.OutHidden()->set_lod(param.InputInput()->lod()); -} - -template class GruKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/gru_unit_kernel.cpp b/mobile/src/operators/kernel/arm/gru_unit_kernel.cpp deleted file mode 100644 index bf20f25d72..0000000000 --- a/mobile/src/operators/kernel/arm/gru_unit_kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef GRU_UNIT_OP - -#include "operators/kernel/gru_unit_kernel.h" -#include "operators/kernel/central-arm-func/gru_unit_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool GruUnitKernel::Init(GruUnitParam *param) { - return true; -} - -template <> -void GruUnitKernel::Compute(const GruUnitParam ¶m) { - GruUnitCompute(param); -} - -template class GruUnitKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/im2sequence_kernel.cpp b/mobile/src/operators/kernel/arm/im2sequence_kernel.cpp deleted file mode 100644 index 07ce0314fa..0000000000 --- a/mobile/src/operators/kernel/arm/im2sequence_kernel.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef IM2SEQUENCE_OP - -#include "operators/kernel/im2sequence_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Im2SequenceKernel::Init(Im2SequenceParam *para) { - return true; -} - -inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0, - int padding_1, int stride) { - const int output_size = - (input_size + padding_0 + padding_1 - filter_size) / stride + 1; - return output_size; -} - -template <> -void Im2SequenceKernel::Compute( - const Im2SequenceParam ¶m) { - const Tensor *in_x = param.Input(); - framework::LoDTensor *out = param.Output(); - out->mutable_data(); - - std::vector kernels = param.Kernels(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - - auto in_x_dim = in_x->dims(); - const int batch_size = static_cast(in_x_dim[0]); - const int img_channels = static_cast(in_x_dim[1]); - const int img_height = static_cast(in_x_dim[2]); - const int img_width = static_cast(in_x_dim[3]); - - int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], - paddings[2], strides[0]); - int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], - paddings[3], strides[1]); - - out->mutable_data({batch_size * output_height * output_width, - img_channels * kernels[0] * kernels[1]}); - const std::vector dilations({1, 1}); - // TODO(): verify - auto out_dims = out->dims(); - out->Resize({batch_size, out->numel() / batch_size}); - for (int i = 0; i < batch_size; i++) { - const Tensor src = - in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( - {output_height, output_width, img_channels, kernels[0], kernels[1]}); - math::Im2ColFunctor f; - f(src, dilations, strides, paddings, &dst); - } - out->Resize(out_dims); - framework::LoD lod(1); - lod[0].reserve(batch_size + 1); - int offset = 0; - lod[0].push_back(offset); - for (int i = 0; i < batch_size; ++i) { - offset += output_height * 
output_width; - lod[0].push_back(offset); - } - out->set_lod(lod); -} - -template class Im2SequenceKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/increment_kernel.cpp b/mobile/src/operators/kernel/arm/increment_kernel.cpp deleted file mode 100644 index 27fd48d084..0000000000 --- a/mobile/src/operators/kernel/arm/increment_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef INCREMENT_OP - -#include "operators/kernel/increment_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool IncrementKernel::Init(IncrementParam *param) { - return true; -} - -template <> -void IncrementKernel::Compute(const IncrementParam ¶m) { - IncrementCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/is_empty_kernel.cpp b/mobile/src/operators/kernel/arm/is_empty_kernel.cpp deleted file mode 100644 index 070d3d16d7..0000000000 --- a/mobile/src/operators/kernel/arm/is_empty_kernel.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef INCREMENT_OP - -#include "operators/kernel/is_empty_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool IsEmptyKernel::Init(IsEmptyParam *param) { - return true; -} - -template <> -void IsEmptyKernel::Compute(const IsEmptyParam ¶m) { - const framework::Tensor *input = param.InputX(); - framework::Tensor *out = param.Out(); - out->mutable_data()[0] = input->numel() == 0; -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/lod_reset_kernel.cpp b/mobile/src/operators/kernel/arm/lod_reset_kernel.cpp deleted file mode 100644 index 264611be01..0000000000 --- a/mobile/src/operators/kernel/arm/lod_reset_kernel.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LOD_RESET_OP - -#include -#include "operators/kernel/kernels.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool LodResetKernel::Init(LodResetParam *param) { - return true; -} - -template <> -void LodResetKernel::Compute(const LodResetParam ¶m) { - const auto *input = param.input_x_; - const auto *lod_t = param.input_y_; - bool append = param.append; - auto *output = param.output_; - - output->ShareDataWith(*input); - - std::vector level0; - if (lod_t) { - if (lod_t->lod().size() > 0) { - output->set_lod(lod_t->lod()); - return; // early return, since lod already set - } else { - auto *lod = lod_t->data(); - level0 = std::vector(lod, lod + lod_t->numel()); - } - } else { - level0 = param.target_lod_; - } - - // cast level0 to size_t - std::vector ulevel0(level0.size(), 0); - std::transform(level0.begin(), level0.end(), ulevel0.begin(), - [](int a) { return static_cast(a); }); - - if (append) { - auto *out_lod = output->mutable_lod(); - out_lod->push_back(ulevel0); - } else { - framework::LoD target_lod; - target_lod.push_back(ulevel0); - output->set_lod(target_lod); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // LOD_RESET_OP diff --git a/mobile/src/operators/kernel/arm/logical_kernel.cpp b/mobile/src/operators/kernel/arm/logical_kernel.cpp deleted file mode 100644 index 3cffcf5c69..0000000000 --- a/mobile/src/operators/kernel/arm/logical_kernel.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/kernel/logical_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template -struct LogicalAndFunctor { - bool operator()(const T& a, const T& b) const { return a && b; } -}; - -template -struct LogicalOrFunctor { - bool operator()(const T& a, const T& b) const { return a || b; } -}; - -template -struct LogicalNotFunctor { - bool operator()(const T& a) const { return !a; } -}; - -template -struct LogicalXorFunctor { - bool operator()(const T& a, const T& b) const { - return (a || b) && !(a && b); - } -}; - -template -void UnaryLogicalCompute(const Tensor* inputX, Tensor* output) { - Functor func; - std::transform(inputX->data(), inputX->data() + inputX->numel(), - output->data(), func); -} - -template -void BinaryLogicalCompute(const Tensor* inputX, const Tensor* inputY, - Tensor* output) { - Functor func; - std::transform(inputX->data(), inputX->data() + inputX->numel(), - inputY->data(), output->data(), func); -} - -#ifdef LOGICAL_AND_OP -template <> -bool LogicalAndKernel::Init(LogicalBinaryParam* param) { - return true; -} - -template <> -void LogicalAndKernel::Compute( - const LogicalBinaryParam& param) { - auto* inputX = param.InputX(); - auto* inputY = param.InputY(); - auto* out = param.Out(); - out->mutable_data(); - BinaryLogicalCompute>(inputX, inputY, out); -} -#endif - -#ifdef LOGICAL_OR_OP -template <> -bool LogicalOrKernel::Init(LogicalBinaryParam* param) { - return true; -} - -template <> -void LogicalOrKernel::Compute( - const LogicalBinaryParam& param) { - auto* inputX = param.InputX(); - auto* inputY = param.InputY(); - auto* out = param.Out(); - out->mutable_data(); - BinaryLogicalCompute>(inputX, inputY, out); -} -#endif - -#ifdef LOGICAL_NOT_OP -template <> -bool LogicalNotKernel::Init(LogicalUnaryParam* param) { - return true; -} - -template <> -void LogicalNotKernel::Compute( - const LogicalUnaryParam& 
param) { - auto* inputX = param.InputX(); - auto* out = param.Out(); - out->mutable_data(); - UnaryLogicalCompute>(inputX, out); -} -#endif - -#ifdef LOGICAL_XOR_OP -template <> -bool LogicalXorKernel::Init(LogicalBinaryParam* param) { - return true; -} - -template <> -void LogicalXorKernel::Compute( - const LogicalBinaryParam& param) { - auto* inputX = param.InputX(); - auto* inputY = param.InputY(); - auto* out = param.Out(); - out->mutable_data(); - BinaryLogicalCompute>(inputX, inputY, out); -} -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/lookup_kernel.cpp b/mobile/src/operators/kernel/arm/lookup_kernel.cpp deleted file mode 100644 index 0e6df6ab6b..0000000000 --- a/mobile/src/operators/kernel/arm/lookup_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef LOOKUP_OP - -#include "operators/kernel/lookup_kernel.h" -#include "operators/kernel/central-arm-func/lookup_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool LookupKernel::Init(LookupParam *param) { - return true; -} - -template <> -void LookupKernel::Compute(const LookupParam ¶m) { - LookupCompute(param); - param.Out()->set_lod(param.InputIds()->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/lrn_kernel.cpp b/mobile/src/operators/kernel/arm/lrn_kernel.cpp deleted file mode 100644 index bf049053fc..0000000000 --- a/mobile/src/operators/kernel/arm/lrn_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LRN_OP - -#include "operators/kernel/lrn_kernel.h" -#include "operators/kernel/central-arm-func/lrn_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool LrnKernel::Init(LrnParam *param) { - return true; -} - -template <> -void LrnKernel::Compute(const LrnParam ¶m) { - LrnCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/mul_kernel.cpp b/mobile/src/operators/kernel/arm/mul_kernel.cpp deleted file mode 100644 index 59d16600d7..0000000000 --- a/mobile/src/operators/kernel/arm/mul_kernel.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MUL_OP - -#include "operators/kernel/mul_kernel.h" -#include "operators/kernel/central-arm-func/mul_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool MulKernel::Init(MulParam *param) { - return true; -} - -template <> -void MulKernel::Compute(const MulParam ¶m) { - MulCompute(param); - param.Out()->set_lod(param.InputX()->lod()); -} - -template class MulKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/mobile/src/operators/kernel/arm/multiclass_nms_kernel.cpp deleted file mode 100644 index 61638da005..0000000000 --- a/mobile/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MULTICLASSNMS_OP - -#include "operators/kernel/multiclass_nms_kernel.h" -#include "operators/kernel/central-arm-func/multiclass_nms_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { - return true; -} - -template <> -void MultiClassNMSKernel::Compute( - const MultiClassNMSParam ¶m) { - MultiClassNMSCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/nearest_interp_kernel.cpp b/mobile/src/operators/kernel/arm/nearest_interp_kernel.cpp deleted file mode 100644 index d412ec1a5d..0000000000 --- a/mobile/src/operators/kernel/arm/nearest_interp_kernel.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef NEAREST_INTERP_OP - -#include "operators/kernel/nearest_interp_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool NearestInterpolationKernel::Init( - NearestInterpolationParam* param) { - return true; -} - -template <> -void NearestInterpolationKernel::Compute( - const NearestInterpolationParam& param) { - auto out_dims = param.Out()->dims(); - auto* input = param.InputX()->data(); - auto out_size_t = param.InputOutPutSize(); - - int out_h = param.OutH(); - int out_w = param.OutW(); - if (out_size_t != nullptr) { - auto out_size_data = out_size_t->data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - auto* output = param.Out()->mutable_data( - {out_dims[0], out_dims[1], out_h, out_w}); - auto batch_size = param.InputX()->dims()[0]; - auto channels = param.InputX()->dims()[1]; - auto in_h = param.InputX()->dims()[2]; - auto in_w = param.InputX()->dims()[3]; - - auto in_hw = in_h * in_w; - auto out_hw = out_h * out_w; - auto in_chw = channels * in_hw; - auto out_chw = channels * out_hw; - - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; - - if (in_h == out_h && in_w == out_w) { - memcpy(output, input, param.InputX()->numel() * sizeof(float)); - } else { - for (int k = 0; k < batch_size; ++k) { // loop for batches - for (int i = 0; i < out_h; ++i) { // loop for images - int h = ratio_h * i + 0.5f; - - for (int j = 0; j < out_w; ++j) { - int w = ratio_w * j + 0.5f; - - // calculate four position for bilinear interpolation - const float* in_pos = &input[k * in_chw + h * in_w + w]; - float* out_pos = &output[k * out_chw + i * out_w + j]; - - for (int c = 0; c < channels; ++c) { // loop for channels - // nearest interpolation - out_pos[0] = in_pos[0]; - in_pos += in_hw; - out_pos += out_hw; - } - } - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/norm_kernel.cpp b/mobile/src/operators/kernel/arm/norm_kernel.cpp deleted file mode 100644 index 32617992cb..0000000000 --- a/mobile/src/operators/kernel/arm/norm_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef NORM_OP - -#include "operators/kernel/norm_kernel.h" -#include "operators/kernel/central-arm-func/norm_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool NormKernel::Init(NormParam *param) { - return true; -} - -template <> -void NormKernel::Compute(const NormParam ¶m) { - NormCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/one_hot_kernel.cpp b/mobile/src/operators/kernel/arm/one_hot_kernel.cpp deleted file mode 100644 index 208b34ea2c..0000000000 --- a/mobile/src/operators/kernel/arm/one_hot_kernel.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ONE_HOT_OP - -#include "operators/kernel/one_hot_kernel.h" -#include "framework/data_type.h" - -namespace paddle_mobile { -namespace operators { - -template -struct OnehotOpFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; - int depth_; - - OnehotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, - int depth) - : in_(in), out_(out), depth_(depth) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(); - memset(p_out_data, 0, out_->numel() * sizeof(OutT)); - - for (int i = 0; i < numel; ++i) { - *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; - } - } -}; - -template <> -bool OnehotKernel::Init(OnehotParam* param) { - return true; -} - -template <> -void OnehotKernel::Compute(const OnehotParam& param) { - framework::VisitDataType( - framework::ToDataType(param.dtype_), - OnehotOpFunctor(param.input_, param.output_, param.depth_)); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ONE_HOT_OP diff --git a/mobile/src/operators/kernel/arm/pad2d_kernel.cpp b/mobile/src/operators/kernel/arm/pad2d_kernel.cpp deleted file mode 100755 index f71058519c..0000000000 --- a/mobile/src/operators/kernel/arm/pad2d_kernel.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PAD2D_OP - -#include "operators/kernel/pad2d_kernel.h" -#include "operators/math/pad.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Pad2DKernel::Init(Pad2DParam *param) { - return true; -} - -template <> -void Pad2DKernel::Compute(const Pad2DParam ¶m) { - const auto *input = param.InputX(); - auto *output = param.Out(); - const auto &paddings = param.paddings_; - // if (param.mode_ == "constant" && param.pad_value_ == 0) { - math::PadFunctor pad; - pad(*input, paddings[0], paddings[1], paddings[2], paddings[3], output); - // } else { - // PADDLE_MOBILE_THROW_EXCEPTION("Pad2D has not been implemented."); - // } - output->set_lod(input->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PAD2D_OP diff --git a/mobile/src/operators/kernel/arm/polygon_box_transform_kernel.cpp b/mobile/src/operators/kernel/arm/polygon_box_transform_kernel.cpp deleted file mode 100644 index 1ae11aba41..0000000000 --- a/mobile/src/operators/kernel/arm/polygon_box_transform_kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POLYGONBOXTRANSFORM_OP - -#include "operators/kernel/polygon_box_transform_kernel.h" -#include "operators/kernel/central-arm-func/polygon_box_transform_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool PolygonBoxTransformKernel::Init( - PolygonBoxTransformParam *param) { - return true; -} - -template <> -void PolygonBoxTransformKernel::Compute( - const PolygonBoxTransformParam ¶m) { - PolygonBoxTransformCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/pool_kernel.cpp b/mobile/src/operators/kernel/arm/pool_kernel.cpp deleted file mode 100644 index 703a73d64b..0000000000 --- a/mobile/src/operators/kernel/arm/pool_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POOL_OP - -#include "operators/kernel/pool_kernel.h" -#include "operators/kernel/central-arm-func/pool_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool PoolKernel::Init(PoolParam *param) { - return true; -} - -template <> -void PoolKernel::Compute(const PoolParam ¶m) { - PoolCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // POOL_OP diff --git a/mobile/src/operators/kernel/arm/prelu_kernel.cpp b/mobile/src/operators/kernel/arm/prelu_kernel.cpp deleted file mode 100644 index 591bd64416..0000000000 --- a/mobile/src/operators/kernel/arm/prelu_kernel.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PRELU_OP - -#include "operators/kernel/prelu_kernel.h" -#include -#if __ARM_NEON -#include -#endif - -namespace paddle_mobile { -namespace operators { - -template -struct PReluFunctor { - explicit PReluFunctor(float slope) { this->slope_ = slope; } - inline T operator()(T in) const { return in > 0 ? 
in : in * slope_; } - - float slope_ = 0.0f; -}; - -/* - * @b 特化到具体平台的实现, param 从 op 层传入 - * */ -template <> -void PReluKernel::Compute(const PReluParam ¶m) { - auto *x = param.InputX(); - auto *alpha = param.InputAlpha(); - auto *out = param.Out(); - std::string mode = param.Mode(); - auto *x_ptr = x->data(); - auto *o_ptr = out->mutable_data(); - auto *alpha_ptr = alpha->data(); - int numel = x->numel(); - auto dim = x->dims(); - int k = dim[0] * dim[1]; - int n = dim[2] * dim[3]; - int index = 0; - int i = 0; - int temp = 0; -#if __ARM_NEON - #pragma omp parallel for - for (int i = 0; i < k; i++) { - float32x4_t zero = vdupq_n_f32(0.0); - float32x4_t cv; - float32x4_t cv1; - float32x4_t cv2; - float32x4_t pv; - for (int j = 0; (j + 3) < n; j += 4) { - const float *in = x_ptr + i * n + j; - float *out = o_ptr + i * n + j; - cv = vld1q_f32(in); - cv1 = vmaxq_f32(cv, zero); - cv2 = vminq_f32(cv, zero); - if (mode == "channel") { - cv2 = vmulq_n_f32(cv2, alpha_ptr[i]); - } else if (mode == "element") { - pv = vld1q_f32(alpha_ptr + i * n + j); - cv2 = vmulq_f32(cv2, pv); - } else { - cv2 = vmulq_n_f32(cv2, alpha_ptr[0]); - } - cv = vaddq_f32(cv1, cv2); - vst1q_f32(out, cv); - } - int j; - for (j = 0; (j + 3) < n; j += 4) { - } - for (int m = j; m < n; m++) { - if (mode == "channel") { - o_ptr[i * n + m] = x_ptr[i * n + m] > 0 - ? x_ptr[i * n + m] - : alpha_ptr[i] * x_ptr[i * n + m]; - } else if (mode == "element") { - o_ptr[i * n + m] = x_ptr[i * n + m] > 0 - ? x_ptr[i * n + m] - : alpha_ptr[i * n + m] * x_ptr[i * n + m]; - } else { - o_ptr[i * n + m] = x_ptr[i * n + m] > 0 - ? x_ptr[i * n + m] - : alpha_ptr[0] * x_ptr[i * n + m]; - } - } - } - -#else - if (mode == "channel") { - temp = numel / (dim[0] * dim[1]); -#pragma omp parallel for - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? 
x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else if (mode == "element") { -#pragma omp parallel for - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i]; - } - } else { -#pragma omp parallel for - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } -#endif -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/prior_box_kernel.cpp b/mobile/src/operators/kernel/arm/prior_box_kernel.cpp deleted file mode 100644 index c067d3388d..0000000000 --- a/mobile/src/operators/kernel/arm/prior_box_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PRIORBOX_OP - -#include "operators/kernel/prior_box_kernel.h" -#include "operators/kernel/central-arm-func/prior_box_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool PriorBoxKernel::Init(PriorBoxParam *param) { - return true; -} - -template <> -void PriorBoxKernel::Compute(const PriorBoxParam ¶m) { - PriorBoxCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/proposal_kernel.cpp b/mobile/src/operators/kernel/arm/proposal_kernel.cpp deleted file mode 100644 index c9d0c18448..0000000000 --- a/mobile/src/operators/kernel/arm/proposal_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PROPOSAL_OP - -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ProposalKernel::Init(ProposalParam *param) { - return true; -} - -template <> -void ProposalKernel::Compute(const ProposalParam ¶m) { - // TODO(hjchen2) -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PROPOSAL_OP diff --git a/mobile/src/operators/kernel/arm/psroi_pool_kernel.cpp b/mobile/src/operators/kernel/arm/psroi_pool_kernel.cpp deleted file mode 100644 index 6ed4c77d2d..0000000000 --- a/mobile/src/operators/kernel/arm/psroi_pool_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PSROI_POOL_OP - -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool PSRoiPoolKernel::Init(PSRoiPoolParam *param) { - return true; -} - -template <> -void PSRoiPoolKernel::Compute(const PSRoiPoolParam ¶m) { - // TODO(hjchen2) -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PSROI_POOL_OP diff --git a/mobile/src/operators/kernel/arm/quantize_kernel.cpp b/mobile/src/operators/kernel/arm/quantize_kernel.cpp deleted file mode 100644 index 515e9cf40d..0000000000 --- a/mobile/src/operators/kernel/arm/quantize_kernel.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef QUANT_OP - -#include "operators/kernel/quantize_kernel.h" -#include -#include "operators/math/quantize.h" - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#ifndef __aarch64__ -inline float32_t vmaxvq_f32(float32x4_t r) { - float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpmax_f32(v, v), 0); -} -#endif - -template -inline void QuantizeOffline(const Tensor *input, const float scale, - const float max_abs, Tensor *output) { - const float *x = input->data(); - int8_t *y = output->mutable_data(); - size_t remain = input->numel(); -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = remain >> 4; - remain = remain & 0xF; - float32x4_t __scale = vdupq_n_f32(scale); - float32x4_t __postive_max = vdupq_n_f32(max_abs); - float32x4_t __negtive_max = vdupq_n_f32(-max_abs); - #pragma omp parallel for - for (size_t i = 0; i < loop; ++i) { - const float *local_x = x + (i << 4); - int8_t *local_y = y + (i << 4); - float32x4_t r0 = vld1q_f32(local_x); - float32x4_t r1 = vld1q_f32(local_x + 4); - float32x4_t r2 = vld1q_f32(local_x + 8); - float32x4_t r3 = vld1q_f32(local_x + 12); - r0 = vmaxq_f32(vminq_f32(r0, __postive_max), __negtive_max); - r1 = vmaxq_f32(vminq_f32(r1, __postive_max), __negtive_max); - r2 = vmaxq_f32(vminq_f32(r2, __postive_max), __negtive_max); - r3 = 
vmaxq_f32(vminq_f32(r3, __postive_max), __negtive_max); - r0 = vmulq_f32(r0, __scale); - r1 = vmulq_f32(r1, __scale); - r2 = vmulq_f32(r2, __scale); - r3 = vmulq_f32(r3, __scale); - int32x4_t q0 = math::vRoundq_f32(r0); - int32x4_t q1 = math::vRoundq_f32(r1); - int32x4_t q2 = math::vRoundq_f32(r2); - int32x4_t q3 = math::vRoundq_f32(r3); - int16x4_t d0 = vmovn_s32(q0); - int16x4_t d1 = vmovn_s32(q1); - int16x4_t d2 = vmovn_s32(q2); - int16x4_t d3 = vmovn_s32(q3); - int16x8_t q5 = vcombine_s16(d0, d1); - int16x8_t q6 = vcombine_s16(d2, d3); - int8x8_t d5 = vmovn_s16(q5); - int8x8_t d6 = vmovn_s16(q6); - vst1_s8(local_y, d5); - vst1_s8(local_y + 8, d6); - } - x += (loop << 4); - y += (loop << 4); -#endif - for (size_t i = 0; i < remain; ++i) { - float x_temp = std::max(std::min(x[i], max_abs), -max_abs); - y[i] = math::Round(x_temp * scale); - } -} - -template -inline void QuantizeOnline(const Tensor *input, const float scale, - Tensor *output) { - const float *x = input->data(); - int8_t *y = output->mutable_data(); - size_t remain = input->numel(); -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = remain >> 4; - remain = remain & 0xF; - float32x4_t __scale = vdupq_n_f32(scale); - #pragma omp parallel for - for (size_t i = 0; i < loop; ++i) { - const float *local_x = x + (i << 4); - int8_t *local_y = y + (i << 4); - float32x4_t r0 = vld1q_f32(local_x); - float32x4_t r1 = vld1q_f32(local_x + 4); - float32x4_t r2 = vld1q_f32(local_x + 8); - float32x4_t r3 = vld1q_f32(local_x + 12); - r0 = vmulq_f32(r0, __scale); - r1 = vmulq_f32(r1, __scale); - r2 = vmulq_f32(r2, __scale); - r3 = vmulq_f32(r3, __scale); - int32x4_t q0 = math::vRoundq_f32(r0); - int32x4_t q1 = math::vRoundq_f32(r1); - int32x4_t q2 = math::vRoundq_f32(r2); - int32x4_t q3 = math::vRoundq_f32(r3); - int16x4_t d0 = vmovn_s32(q0); - int16x4_t d1 = vmovn_s32(q1); - int16x4_t d2 = vmovn_s32(q2); - int16x4_t d3 = vmovn_s32(q3); - int16x8_t q5 = vcombine_s16(d0, d1); - int16x8_t q6 = 
vcombine_s16(d2, d3); - int8x8_t d5 = vmovn_s16(q5); - int8x8_t d6 = vmovn_s16(q6); - vst1_s8(local_y, d5); - vst1_s8(local_y + 8, d6); - } - x += (loop << 4); - y += (loop << 4); -#endif - for (size_t i = 0; i < remain; ++i) { - y[i] = math::Round(x[i] * scale); - } -} - -template -static void Quantize(const Tensor *input, const float max_abs, - const bool offline, Tensor *output) { - float scale = 127.f / max_abs; - if (offline) { - QuantizeOffline(input, scale, max_abs, output); - } else { - QuantizeOnline(input, scale, output); - } -} - -float find_abs_max(const Tensor *input) { - float max_abs = 0.f; - const float *x = input->data(); - size_t remain = input->numel(); -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = remain >> 4; - remain = remain & 0xF; - float32x4_t __max = {0.f, 0.f, 0.f, 0.f}; - - for (size_t i = 0; i < loop; ++i, x += 16) { - float32x4_t r0 = vld1q_f32(x); - float32x4_t r1 = vld1q_f32(x + 4); - float32x4_t r2 = vld1q_f32(x + 8); - float32x4_t r3 = vld1q_f32(x + 12); - r0 = vabsq_f32(r0); - r1 = vabsq_f32(r1); - r2 = vabsq_f32(r2); - r3 = vabsq_f32(r3); - r0 = vmaxq_f32(r0, r1); - r1 = vmaxq_f32(r2, r3); - r0 = vmaxq_f32(r0, r1); - __max = vmaxq_f32(r0, __max); - } - max_abs = vmaxvq_f32(__max); -#endif - for (size_t i = 0; i < remain; ++i) { - max_abs = std::max(max_abs, static_cast(fabs(x[i]))); - } - return max_abs; -} - -} // namespace operators -} // namespace paddle_mobile -#endif // __ARM_NEON__ - -namespace paddle_mobile { -namespace operators { - -template <> -bool QuantizeKernel::Init(QuantizeParam *param) { - return true; -} - -template <> -void QuantizeKernel::Compute(const QuantizeParam ¶m) { - const LoDTensor *input = param.input_; - LoDTensor *output = param.output_; - Tensor *output_scale = param.online_scale_; - float max_abs = 0.f; - if (param.offline_) { - max_abs = param.offline_scale_->data()[0]; - } else { - max_abs = find_abs_max(input); - } - max_abs = std::max(max_abs, 1e-6f); - 
param.online_scale_->mutable_data()[0] = max_abs; - switch (param.round_type_) { - case ROUND_NEAREST_TO_EVEN: - Quantize(input, max_abs, param.offline_, output); - break; - case ROUND_NEAREST_TOWARDS_ZERO: - Quantize(input, max_abs, param.offline_, - output); - break; - case ROUND_NEAREST_AWAY_ZERO: - Quantize(input, max_abs, param.offline_, output); - break; - default: - LOG(kLOG_ERROR) << "round type is not supported."; - break; - } - output->set_lod(input->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // QUANT_OP diff --git a/mobile/src/operators/kernel/arm/reshape2_kernel.cpp b/mobile/src/operators/kernel/arm/reshape2_kernel.cpp deleted file mode 100644 index 093105f906..0000000000 --- a/mobile/src/operators/kernel/arm/reshape2_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE2_OP - -#include "operators/kernel/reshape2_kernel.h" -#include "operators/kernel/central-arm-func/reshape2_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Reshape2Kernel::Init(Reshape2Param *param) { - return true; -} - -template <> -void Reshape2Kernel::Compute(const Reshape2Param ¶m) { - Reshape2Compute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/reshape_kernel.cpp b/mobile/src/operators/kernel/arm/reshape_kernel.cpp deleted file mode 100644 index 800808f9c2..0000000000 --- a/mobile/src/operators/kernel/arm/reshape_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE_OP - -#include "operators/kernel/reshape_kernel.h" -#include "operators/kernel/central-arm-func/reshape_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReshapeKernel::Init(ReshapeParam *param) { - return true; -} - -template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) { - ReshapeCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/resize_kernel.cpp b/mobile/src/operators/kernel/arm/resize_kernel.cpp deleted file mode 100644 index 6a6af36788..0000000000 --- a/mobile/src/operators/kernel/arm/resize_kernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESIZE_OP - -#include "operators/kernel/resize_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { -void BiLinearResizeTensor(const float* src, const int src_height, - const int src_width, float* dst, const int dst_height, - const int dst_width) { - const float scale_w = src_width / static_cast(dst_width); - const float scale_h = src_height / static_cast(dst_height); - float* dst_data = dst; - const float* src_data = src; - - for (int dst_h = 0; dst_h < dst_height; ++dst_h) { - float fh = dst_h * scale_h; - - int src_h = std::floor(fh); - - fh -= src_h; - const float w_h0 = fabs(1.0 - fh); - const float w_h1 = fabs(fh); - - const int dst_offset_1 = dst_h * dst_width; - const int src_offset_1 = src_h * src_width; - - float* dst_data_ptr = dst_data + dst_offset_1; - - for (int dst_w = 0; dst_w < dst_width; ++dst_w) { - float fw = dst_w * scale_w; - int src_w = std::floor(fw); - fw -= src_w; - const float w_w0 = fabs(1.0 - fw); - const float w_w1 = fabs(fw); - - float dst_value = 0; - - const int src_idx = src_offset_1 + src_w; - dst_value += (w_h0 * w_w0 * src_data[src_idx]); - int flag = 0; - if (src_w + 1 < src_width) { - dst_value += (w_h0 * w_w1 * src_data[src_idx + 1]); - ++flag; - } - if (src_h + 1 < src_height) { - dst_value += (w_h1 * w_w0 * src_data[src_idx + src_width]); - ++flag; - } - - if (flag > 1) { - dst_value += (w_h1 * w_w1 * src_data[src_idx + src_width + 1]); - // ++flag; - } - *(dst_data_ptr++) = dst_value; - } - } -} - -void ResizeTensor(const Tensor* src, const int src_n, const int src_c, - Tensor* dst, const int dst_n, const int dst_c) { - framework::DDim in_dims = src->dims(); - const int src_chans = in_dims[1]; - const int src_height = in_dims[2]; - const int src_width = in_dims[3]; - const int src_offset = (src_n * src_chans + src_c) * src_height * src_width; - - framework::DDim out_dims = dst->dims(); - const int dst_chans = out_dims[1]; - const int dst_height = out_dims[2]; - const int dst_width = 
out_dims[3]; - const int dst_offset = (dst_n * dst_chans + dst_c) * dst_height * dst_width; - - const auto* src_ptr = src->data(); - auto* dst_ptr = dst->data(); - const auto* src_data = &(src_ptr[src_offset]); - auto* dst_data = &(dst_ptr[dst_offset]); - BiLinearResizeTensor(src_data, src_height, src_width, dst_data, dst_height, - dst_width); -} - -void ResizeTensor(const Tensor* src, Tensor* dst) { - framework::DDim in_dims = src->dims(); - framework::DDim out_dims = dst->dims(); - PADDLE_MOBILE_ENFORCE(in_dims[0] == out_dims[0], - "src tensor batch num not equal to dst tensor"); - PADDLE_MOBILE_ENFORCE(in_dims[1] == out_dims[1], - "src tensor channel num not equal to dst tensor"); - for (int n = 0, batch_num = in_dims[0]; n < batch_num; ++n) { - for (int c = 0, chan_num = in_dims[1]; c < chan_num; ++c) { - ResizeTensor(src, n, c, dst, n, c); - } - } -} - -template <> -void ResizeKernel::Compute(const ResizeParam& param) { - const auto* input_x = param.InputX(); - const auto& input_x_dims = input_x->dims(); - auto* out = param.Out(); - framework::DDim out_dims = CalOutputShape(param); - - out->Resize(out_dims); - ResizeTensor(input_x, out); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/roi_perspective_kernel.cpp b/mobile/src/operators/kernel/arm/roi_perspective_kernel.cpp deleted file mode 100644 index c8b0cb8bf2..0000000000 --- a/mobile/src/operators/kernel/arm/roi_perspective_kernel.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ROI_PERSPECTIVE_OP - -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template -inline bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; -} - -template -inline bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; -} - -// check if (x, y) is in the boundary of roi -template -bool in_quad(T x, T y, T roi_x[], T roi_y[]) { - for (int i = 0; i < 4; i++) { - T xs = roi_x[i]; - T ys = roi_y[i]; - T xe = roi_x[(i + 1) % 4]; - T ye = roi_y[(i + 1) % 4]; - if (fabs(ys - ye) < 1e-4) { - if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 && - GT_E(x, std::min(xs, xe)) && LT_E(x, std::max(xs, xe))) { - return true; - } - } else { - T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; - if (fabs(intersec_x - x) < 1e-4 && GT_E(y, std::min(ys, ye)) && - LT_E(y, std::max(ys, ye))) { - return true; - } - } - } - - int n_cross = 0; - for (int i = 0; i < 4; i++) { - T xs = roi_x[i]; - T ys = roi_y[i]; - T xe = roi_x[(i + 1) % 4]; - T ye = roi_y[(i + 1) % 4]; - if (fabs(ys - ye) < 1e-4) { - continue; - } - if (LT_E(y, std::min(ys, ye)) || (y > std::max(ys, ye))) { - continue; - } - T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; - if (fabs(intersec_x - x) < 1e-4) { - return true; - } - if (intersec_x > x) { - n_cross++; - } - } - return (n_cross % 2 == 1); -} - -template -void get_transform_matrix(const int transformed_width, - const int transformed_height, T roi_x[], T roi_y[], - T matrix[]) { - T x0 = roi_x[0]; - T x1 = roi_x[1]; - T x2 = roi_x[2]; - T x3 = roi_x[3]; - T y0 = roi_y[0]; - T 
y1 = roi_y[1]; - T y2 = roi_y[2]; - T y3 = roi_y[3]; - - // Estimate the height and width of RoI - T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); - T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)); - T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)); - T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)); - T estimated_height = (len2 + len4) / 2.0; - T estimated_width = (len1 + len3) / 2.0; - - // Get the normalized height and normalized width - int normalized_height = transformed_height; - int normalized_width = - std::round(estimated_width * (normalized_height - 1) / estimated_height) + - 1; - normalized_width = std::min(normalized_width, transformed_width); - - T dx1 = x1 - x2; - T dx2 = x3 - x2; - T dx3 = x0 - x1 + x2 - x3; - T dy1 = y1 - y2; - T dy2 = y3 - y2; - T dy3 = y0 - y1 + y2 - y3; - - matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / - (normalized_width - 1); - matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / - (normalized_height - 1); - matrix[8] = 1; - - matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / - (normalized_width - 1); - matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) / - (normalized_height - 1); - matrix[5] = y0; - - matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / - (normalized_width - 1); - matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / - (normalized_height - 1); - matrix[2] = x0; -} - -// Get the source coordinates in the input feature map. 
-// (u, v, w)^matrix = matrix * (out_w, out_h, 1)^matrix -// in_w = u / w -// in_h = v / w -template -void get_source_coords(T matrix[], int out_w, int out_h, T *in_w, T *in_h) { - T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]; - T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]; - T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]; - - in_w[0] = u / w; - in_h[0] = v / w; -} - -template -void bilinear_interpolate(const T *in_data, const int channels, const int width, - const int height, int in_n, int in_c, T in_w, T in_h, - T *val) { - // Deal with cases that source coords are out of feature map boundary - if ((-0.5 > in_w) || (in_w > width - 0.5) || (-0.5 > in_h) || - (in_h > height - 0.5)) { - // empty - val[0] = 0.0; - return; - } - - if (in_w < 0) { - in_w = 0; - } - if (in_h < 0) { - in_h = 0; - } - - int in_w_floor = floor(in_w); - int in_h_floor = floor(in_h); - int in_w_ceil; - int in_h_ceil; - - if (GT_E(in_w_floor, width - 1)) { - in_w_ceil = in_w_floor = width - 1; - in_w = static_cast(in_w_floor); - } else { - in_w_ceil = in_w_floor + 1; - } - - if (GT_E(in_h_floor, height - 1)) { - in_h_ceil = in_h_floor = height - 1; - in_h = static_cast(in_h_floor); - } else { - in_h_ceil = in_h_floor + 1; - } - T w_floor = in_w - in_w_floor; - T h_floor = in_h - in_h_floor; - T w_ceil = 1 - w_floor; - T h_ceil = 1 - h_floor; - const T *data = in_data + (in_n * channels + in_c) * height * width; - // Do bilinear interpolation - T v1 = data[in_h_floor * width + in_w_floor]; - T v2 = data[in_h_ceil * width + in_w_floor]; - T v3 = data[in_h_ceil * width + in_w_ceil]; - T v4 = data[in_h_floor * width + in_w_ceil]; - T w1 = w_ceil * h_ceil; - T w2 = w_ceil * h_floor; - T w3 = w_floor * h_floor; - T w4 = w_floor * h_ceil; - val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; -} - -template <> -bool RoiPerspectiveKernel::Init(RoiPerspectiveParam *param) { - return true; -} - -template <> -void RoiPerspectiveKernel::Compute( - const RoiPerspectiveParam ¶m) { 
- const auto *input_x = param.input_x_; - const auto *input_rois = param.input_rois_; - auto *output = param.output_; - auto *transform_Matrix = param.transform_Matrix_; - auto *mask = param.mask; - - const auto &in_dims = input_x->dims(); - const int channels = in_dims[1]; - const int in_height = in_dims[2]; - const int in_width = in_dims[3]; - const int rois_num = input_rois->dims()[0]; - const int transformed_height = param.transformed_height_; - const int transformed_width = param.transformed_width_; - const float spatial_scale = param.spatial_scale_; - - const float *input_data = input_x->data(); - const float *rois_data = input_rois->data(); - float *output_data = output->mutable_data(); - int *mask_data = mask->mutable_data(); - float *transform_matrix = - transform_Matrix->mutable_data({rois_num, 9}); - - std::vector roi2image(rois_num); - const auto &lod = input_rois->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - for (size_t j = lod[i]; j < lod[i + 1]; ++j) { - roi2image[j] = i; - } - } - - for (int n = 0; n < rois_num; ++n) { - const float *n_rois = rois_data + n * 8; - float roi_x[4]; - float roi_y[4]; - for (int k = 0; k < 4; ++k) { - roi_x[k] = n_rois[2 * k] * spatial_scale; - roi_y[k] = n_rois[2 * k + 1] * spatial_scale; - } - int image_id = roi2image[n]; - // Get transform matrix - // float transform_matrix[9]; - float matrix[9]; - get_transform_matrix(transformed_width, transformed_height, roi_x, - roi_y, matrix); - for (int i = 0; i < 9; i++) { - transform_matrix[n * 9 + i] = matrix[i]; - } - for (int c = 0; c < channels; ++c) { - for (int out_h = 0; out_h < transformed_height; ++out_h) { - for (int out_w = 0; out_w < transformed_width; ++out_w) { - int out_index = - n * channels * transformed_height * transformed_width + - c * transformed_height * transformed_width + - out_h * transformed_width + out_w; - float in_w, in_h; - get_source_coords(matrix, out_w, out_h, &in_w, &in_h); - if (in_quad(in_w, in_h, roi_x, roi_y)) { - if 
((-0.5 > in_w) || (in_w > (in_width - 0.5)) || (-0.5 > in_h) || - (in_h > (in_height - 0.5))) { - output_data[out_index] = 0.0; - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 0; - } else { - bilinear_interpolate(input_data, channels, in_width, - in_height, image_id, c, in_w, in_h, - output_data + out_index); - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 1; - } - } else { - output_data[out_index] = 0.0; - mask_data[(n * transformed_height + out_h) * transformed_width + - out_w] = 1; - } - } - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ROI_PERSPECTIVE_OP diff --git a/mobile/src/operators/kernel/arm/scale_kernel.cpp b/mobile/src/operators/kernel/arm/scale_kernel.cpp deleted file mode 100644 index fffcb07533..0000000000 --- a/mobile/src/operators/kernel/arm/scale_kernel.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SCALE_OP - -#include "operators/kernel/scale_kernel.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { - -template <> -bool ScaleKernel::Init(ScaleParam *param) { - return true; -} - -template <> -void ScaleKernel::Compute(const ScaleParam ¶m) { - const auto input = param.InputX(); - auto output = param.Out(); - if (input->dims() != output->dims()) { - output->Resize(input->dims()); - } - const float scale = param.Scale(); - const float bias = param.Bias(); - if (input->type() == type_id().hash_code()) { - const int64_t *input_data = input->data(); - int64_t *output_data = output->mutable_data(); - - int i = 0; - for (; i < output->numel(); ++i, ++output_data, ++input_data) { - *output_data = scale * (*input_data) + bias; - } - } else { - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - - int i = 0; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - float32x4_t vscale = vdupq_n_f32(scale); - float32x4_t vbias = vdupq_n_f32(bias); - for (; i < output->numel() - 15; i += 16) { - float32x4_t _in0 = vld1q_f32(input_data); - float32x4_t _in1 = vld1q_f32(input_data + 4); - float32x4_t _in2 = vld1q_f32(input_data + 8); - float32x4_t _in3 = vld1q_f32(input_data + 12); - _in0 = vmlaq_f32(vbias, vscale, _in0); - _in1 = vmlaq_f32(vbias, vscale, _in1); - _in2 = vmlaq_f32(vbias, vscale, _in2); - _in3 = vmlaq_f32(vbias, vscale, _in3); - vst1q_f32(output_data, _in0); - vst1q_f32(output_data + 4, _in1); - vst1q_f32(output_data + 8, _in2); - vst1q_f32(output_data + 12, _in3); - input_data += 16; - output_data += 16; - } - for (; i < output->numel() - 3; i += 4) { - float32x4_t _in0 = vld1q_f32(input_data); - _in0 = vmlaq_f32(vbias, vscale, _in0); - vst1q_f32(output_data, _in0); - input_data += 4; - output_data += 4; - } -#endif - for (; i < output->numel(); ++i, ++output_data, ++input_data) { - *output_data = scale * (*input_data) + bias; - } - } -} - -} // 
namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/sequence_expand_kernel.cpp b/mobile/src/operators/kernel/arm/sequence_expand_kernel.cpp deleted file mode 100644 index 82941ff0d5..0000000000 --- a/mobile/src/operators/kernel/arm/sequence_expand_kernel.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SEQUENCE_EXPAND_OP - -#include -#include "operators/kernel/sequence_kernels.h" - -namespace paddle_mobile { -namespace operators { - -typedef int (*LoDElementFunctor)(const std::vector &x_lod, int index); - -int element_with_lod(const std::vector &x_lod, int index) { - return x_lod[index]; -} - -int element_without_lod(const std::vector &x_lod, int index) { - return index; -} - -template -inline void SequenceExpandImpl(const framework::LoDTensor &x, - const std::vector &ref_lod, - framework::LoDTensor *output) { - const T *x_data = x.data(); - auto &x_lod = x.lod(); - LoDElementFunctor lod_element = element_without_lod; - if (x_lod.size() == 1) lod_element = element_with_lod; - - T *output_data = output->mutable_data(); - int x_item_length = x.numel() / x.dims()[0]; - int out_offset = 0; - - for (size_t i = 1; i < ref_lod.size(); ++i) { - int repeat_num = ref_lod[i] - ref_lod[i - 1]; - int x_start = lod_element(x_lod[0], i - 1); - int x_end = lod_element(x_lod[0], i); - int x_seq_len = x_end - x_start; - if (repeat_num > 0) { 
- int out_start = out_offset; - if (output->lod().size() == 1) { - out_start = output->lod()[0][out_offset]; - } - for (int j = 0; j < repeat_num; j++) { - for (int k = 0; k < x_seq_len; k++) { - memcpy(output_data + (out_start + j * x_seq_len + k) * x_item_length, - x_data + (x_start + k) * x_item_length, - x_item_length * sizeof(T)); - } - } - } - out_offset += repeat_num; - } -} - -template -class SequenceExpandKernel - : public framework::OpKernelBase> { - public: - bool Init(SequenceExpandParam *param) { return true; } - - void Compute(const SequenceExpandParam ¶m) { - const framework::LoDTensor *input_x = param.input_x_; - const framework::LoDTensor *input_y = param.input_y_; - framework::LoDTensor *output = param.output_; - output->mutable_data(); - - const auto &x_lod = input_x->lod(); - const auto &y_lod = input_y->lod(); - int ref_level = param.ref_level_; - if (ref_level == -1) ref_level = y_lod.size() - 1; - - if (y_lod[ref_level].size() <= 1) { - framework::TensorCopy(*input_x, output); - output->set_lod(input_x->lod()); - return; - } - - std::vector out_lod; - if (x_lod.size() == 1) { - out_lod.push_back(0); - for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { - int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; - int x_start = x_lod[0][i - 1]; - int x_end = x_lod[0][i]; - int x_seq_len = x_end - x_start; - for (int j = 0; j < repeat_num; ++j) { - out_lod.push_back(out_lod.back() + x_seq_len); - } - } - output->set_lod({out_lod}); - } - SequenceExpandImpl(*input_x, y_lod[ref_level], output); - } -}; - -template class SequenceExpandKernel; -// template class SequenceExpandKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif // SEQUENCE_EXPAND_OP diff --git a/mobile/src/operators/kernel/arm/sequence_pool_kernel.cpp b/mobile/src/operators/kernel/arm/sequence_pool_kernel.cpp deleted file mode 100644 index db1939d4d0..0000000000 --- a/mobile/src/operators/kernel/arm/sequence_pool_kernel.cpp +++ /dev/null @@ -1,215 
+0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SEQUENCE_POOL_OP - -#include -#include -#include -#include -#include "common/types.h" -#include "operators/kernel/sequence_kernels.h" -#include "operators/math/pooling.h" -#ifdef __ARM_NEON__ -#include -#endif // __ARM_NEON__ - -namespace paddle_mobile { -namespace operators { - -template -void SequencePoolImpl(const framework::LoDTensor &input, - framework::LoDTensor *output) { - const float *input_ptr = input.data(); - float *output_ptr = output->mutable_data(); - const auto &lod = input.lod()[0]; - int64_t width = input.numel() / input.dims()[0]; - - #pragma omp parallel for - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float *in_ptr = input_ptr + lod[i] * width; - float *out_ptr = output_ptr + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (width == 1) { - float max = -std::numeric_limits::max(); - int remain_h = height; -#ifdef __ARM_NEON__ - int loop = remain_h >> 2; - remain_h = remain_h & 0x3; - float32x4_t __max4 = math::vPoolInitq_f32(); - for (int h = 0; h < loop; ++h) { - float32x4_t r0 = vld1q_f32(in_ptr); - __max4 = vmaxq_f32(__max4, r0); - in_ptr += 4; - } - float32x2_t __max2 = - vpmax_f32(vget_low_f32(__max4), vget_high_f32(__max4)); - __max2 = vpmax_f32(__max2, __max2); - max = std::max(max, vget_lane_f32(__max2, 0)); -#endif // __ARM_NEON__ - for (int h = 0; h < remain_h; ++h) { - max = 
std::max(max, in_ptr[h]); - } - *out_ptr = max; - } else { - memcpy(out_ptr, in_ptr, width * sizeof(float)); - in_ptr += width; - int remain_h = height - 1; - int remain_w_start = 0; -#ifdef __ARM_NEON__ - remain_w_start = width & 0xfffffffc; -#endif // __ARM_NEON__ - for (int h = 0; h < remain_h; ++h) { -#ifdef __ARM_NEON__ - for (int w = 0; w < width; w += 4) { - float32x4_t __in = vld1q_f32(in_ptr + w); - float32x4_t __out = vld1q_f32(out_ptr + w); - __out = vmaxq_f32(__out, __in); - vst1q_f32(out_ptr + w, __out); - } -#endif // __ARM_NEON__ - for (int w = remain_w_start; w < width; ++w) { - out_ptr[w] = std::max(out_ptr[w], in_ptr[w]); - } - in_ptr += width; - } - } - } -} - -template <> -void SequencePoolImpl(const framework::LoDTensor &input, - framework::LoDTensor *output) { - const float *input_ptr = input.data(); - float *output_ptr = output->mutable_data(); - const auto &lod = input.lod()[0]; - int64_t width = input.numel() / input.dims()[0]; - - #pragma omp parallel for - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float *in_ptr = input_ptr + lod[i] * width; - float *out_ptr = output_ptr + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (width == 1) { - float sum = 0.f; - int remain_h = height; -#ifdef __ARM_NEON__ - int loop = remain_h >> 2; - remain_h = remain_h & 0x3; - float32x4_t __sum4 = vdupq_n_f32(0.f); - for (int h = 0; h < loop; ++h) { - float32x4_t r0 = vld1q_f32(in_ptr); - __sum4 = vaddq_f32(__sum4, r0); - in_ptr += 4; - } - float32x2_t __sum2 = - vpadd_f32(vget_low_f32(__sum4), vget_high_f32(__sum4)); - sum += vget_lane_f32(__sum2, 0) + vget_lane_f32(__sum2, 1); -#endif // __ARM_NEON__ - for (int h = 0; h < remain_h; ++h) { - sum += in_ptr[h]; - } - *out_ptr = sum; - } else { - memcpy(out_ptr, in_ptr, width * sizeof(float)); - in_ptr += width; - int remain_h = height - 1; - int remain_w_start = 0; -#ifdef __ARM_NEON__ - int loop_w = width >> 2; - remain_w_start = width & 0xfffffffc; -#endif // 
__ARM_NEON__ - for (int h = 0; h < remain_h; ++h) { -#ifdef __ARM_NEON__ - for (int w = 0; w < width - 3; w += 4) { - float32x4_t __in = vld1q_f32(in_ptr + w); - float32x4_t __out = vld1q_f32(out_ptr + w); - __out = vaddq_f32(__out, __in); - vst1q_f32(out_ptr + w, __out); - } -#endif // __ARM_NEON__ - for (int w = remain_w_start; w < width; ++w) { - out_ptr[w] += in_ptr[w]; - } - in_ptr += width; - } - } - } -} - -template <> -void SequencePoolImpl(const framework::LoDTensor &input, - framework::LoDTensor *output) { - const float *input_ptr = input.data(); - float *output_ptr = output->mutable_data(); - const auto &lod = input.lod()[0]; - int64_t width = input.numel() / input.dims()[0]; - - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float *in_ptr = input_ptr + lod[i] * width; - float *out_ptr = output_ptr + i * width; - memcpy(out_ptr, in_ptr, width * sizeof(float)); - } -} - -template <> -void SequencePoolImpl(const framework::LoDTensor &input, - framework::LoDTensor *output) { - const float *input_ptr = input.data(); - float *output_ptr = output->mutable_data(); - const auto &lod = input.lod()[0]; - int64_t width = input.numel() / input.dims()[0]; - - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - int64_t seq_len = static_cast(lod[i + 1] - lod[i]); - const float *in_ptr = input_ptr + seq_len * width; - float *out_ptr = output_ptr + i * width; - memcpy(out_ptr, in_ptr - width, width * sizeof(float)); - } -} - -template -class SequencePoolKernel - : public framework::OpKernelBase> { - public: - bool Init(SequencePoolParam *param) { return true; } - - void Compute(const SequencePoolParam ¶m) { - const framework::LoDTensor *input = param.input_; - framework::LoDTensor *output = param.output_; - output->mutable_data(); - const std::string pooling_type = param.pool_type_; - - if (param.pool_type_ == "MAX") { - SequencePoolImpl(*input, output); - } else if (param.pool_type_ == "FIRST") { - SequencePoolImpl(*input, output); - } else if 
(param.pool_type_ == "LAST") { - SequencePoolImpl(*input, output); - } else if (param.pool_type_ == "SUM") { - SequencePoolImpl(*input, output); - } else { - PADDLE_MOBILE_THROW_EXCEPTION( - "pooling type `%s` has not been implemented.", - param.pool_type_.c_str()); - } - } -}; - -template class SequencePoolKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif // SEQUENCE_POOL_OP diff --git a/mobile/src/operators/kernel/arm/sequence_softmax_kernel.cpp b/mobile/src/operators/kernel/arm/sequence_softmax_kernel.cpp deleted file mode 100644 index b0df21fac5..0000000000 --- a/mobile/src/operators/kernel/arm/sequence_softmax_kernel.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SEQUENCE_SOFTMAX_OP - -#include "framework/lod_tensor.h" -#include "operators/kernel/sequence_kernels.h" -#include "operators/math/softmax.h" - -namespace paddle_mobile { -namespace operators { - -template -class SequenceSoftmaxKernel - : public framework::OpKernelBase> { - public: - bool Init(SoftmaxParam *param) { return true; } - - void Compute(const SoftmaxParam ¶m) { - param.Out()->mutable_data(); - const framework::LoDTensor *input = param.InputX(); - framework::LoDTensor *output = param.Out(); - math::SequenceSoftmaxFuntor sequence_softmax; - sequence_softmax(input, output); - } -}; - -template class SequenceSoftmaxKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif // SEQUENCE_SOFTMAX_OP diff --git a/mobile/src/operators/kernel/arm/shape_kernel.cpp b/mobile/src/operators/kernel/arm/shape_kernel.cpp deleted file mode 100644 index 4adbf8fa13..0000000000 --- a/mobile/src/operators/kernel/arm/shape_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SHAPE_OP - -#include "operators/kernel/shape_kernel.h" -#include "operators/kernel/central-arm-func/shape_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ShapeKernel::Init(ShapeParam *param) { - return true; -} - -template <> -void ShapeKernel::Compute(const ShapeParam ¶m) { - ShapeCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/slice_kernel.cpp b/mobile/src/operators/kernel/arm/slice_kernel.cpp deleted file mode 100644 index aeb18c8d20..0000000000 --- a/mobile/src/operators/kernel/arm/slice_kernel.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SLICE_OP - -#include "operators/kernel/slice_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template -void SliceCompute(const SliceParam& param) { - auto input = param.input_; - auto output = param.output_; - auto* input_ptr = input->data(); - auto* output_ptr = output->mutable_data(); - auto out_dims = output->dims(); - auto in_dims = input->dims(); - auto starts = param.starts_; - auto ends = param.ends_; - int axes = param.axes_[0]; - int HW = 1; - if (in_dims.size() >= 2 && axes <= in_dims.size() - 2) { - HW = in_dims[axes + 1] * input->dims()[axes + 2]; - } - int batch_size = (out_dims.size() == 1) ? 
1 : out_dims[axes - 1]; - int input_channel = in_dims[axes]; - int output_channel = out_dims[axes]; - - for (int c1 = 0; c1 < batch_size; ++c1) { - for (int c2 = starts[0], c3 = 0; c2 < ends[0]; ++c2, ++c3) { - size_t out_offset = c1 * output_channel * HW + c3 * HW; - size_t in_offset = c1 * input_channel * HW + c2 * HW; - memcpy(output_ptr + out_offset, input_ptr + in_offset, - HW * sizeof(float)); - } - } -} - -template <> -bool SliceKernel::Init(SliceParam* param) { - return true; -} - -template <> -void SliceKernel::Compute(const SliceParam& param) { - int rank = param.input_->dims().size(); - switch (rank) { - case 1: - if (param.input_->type() == type_id().hash_code()) { - SliceCompute(param); - } else if (param.input_->type() == type_id().hash_code()) { - SliceCompute(param); - } - break; - case 2: - SliceCompute(param); - break; - case 4: - SliceCompute(param); - break; - case 5: - if (param.input_->dims()[0] == 1) { - SliceCompute(param); - } - break; - default: - PADDLE_MOBILE_ENFORCE(0, "input dims not support now"); - break; - } -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/arm/softmax_kernel.cpp b/mobile/src/operators/kernel/arm/softmax_kernel.cpp deleted file mode 100644 index bdb05656d4..0000000000 --- a/mobile/src/operators/kernel/arm/softmax_kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SOFTMAX_OP - -#include "../softmax_kernel.h" -#include "../central-arm-func/softmax_arm_func.h" -#include "operators/math/softmax.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool SoftmaxKernel::Init(SoftmaxParam *param) { - return true; -} - -template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - SoftmaxCompute(param); - param.Out()->set_lod(param.InputX()->lod()); -} - -template class SoftmaxKernel; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/split_kernel.cpp b/mobile/src/operators/kernel/arm/split_kernel.cpp deleted file mode 100644 index 13c7567e3d..0000000000 --- a/mobile/src/operators/kernel/arm/split_kernel.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SPLIT_OP - -#include "operators/kernel/split_kernel.h" -#include "operators/kernel/central-arm-func/split_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SplitKernel::Init(SplitParam *param) { - return true; -} - -template <> -void SplitKernel::Compute(const SplitParam ¶m) { - SplitCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/sum_kernel.cpp b/mobile/src/operators/kernel/arm/sum_kernel.cpp deleted file mode 100644 index 2b36a382a1..0000000000 --- a/mobile/src/operators/kernel/arm/sum_kernel.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SUM_OP - -#include "operators/kernel/sum_kernel.h" -#include "operators/kernel/central-arm-func/sum_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SumKernel::Init(SumParam *param) { - return true; -} - -template <> -void SumKernel::Compute(const SumParam ¶m) { - SumCompute(param); - param.Out()->set_lod(param.Inputs()[0]->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp b/mobile/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp deleted file mode 100644 index bdf10574a8..0000000000 --- a/mobile/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/kernel/tensor_array_read_write_kernel.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef WRITE_TO_ARRAY_OP -template <> -bool WriteToArrayKernel::Init(WriteToArrayParam *param) { - return true; -} - -template <> -void WriteToArrayKernel::Compute( - const WriteToArrayParam ¶m) { - int64_t offset = param.index_->data()[0]; - if (offset >= param.output_->size()) { - while (param.output_->size() <= offset) { - param.output_->emplace_back(); - } - } - - framework::LoDTensor *out_tensor = &(param.output_->at(offset)); - out_tensor->set_lod(param.input_->lod()); - if (param.input_->memory_size() > 0) { - TensorCopy(*(param.input_), out_tensor); - } -} -#endif // WRITE_TO_ARRAY_OP - -#ifdef READ_FROM_ARRAY_OP -template <> -bool ReadFromArrayKernel::Init(ReadFromArrayParam *param) { - return true; -} - -template <> -void ReadFromArrayKernel::Compute( - const ReadFromArrayParam ¶m) { - int64_t offset = param.index_->data()[0]; - if (offset < param.input_->size()) { - TensorCopy(param.input_->at(offset), param.output_); - param.output_->set_lod(param.input_->at(offset).lod()); - } else { - PADDLE_MOBILE_THROW_EXCEPTION( - "Can not read tensor which index is `%d` since it only has `%d` inputs", - offset, param.input_->size()); - } -} -#endif // READ_FROM_ARRAY_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/arm/top_k_kernel.cpp b/mobile/src/operators/kernel/arm/top_k_kernel.cpp deleted file mode 100644 index 54a4f5b1a9..0000000000 --- a/mobile/src/operators/kernel/arm/top_k_kernel.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TOP_K_OP - -#include -#include -#include -#include "operators/kernel/kernels.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool TopKKernel::Init(TopKParam *param) { - return true; -} - -template <> -void TopKKernel::Compute(const TopKParam ¶m) { - const Tensor *input = param.input_; - Tensor *output = param.output_; - Tensor *indices = param.indices_; - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - int64_t *indices_data = indices->mutable_data(); - - framework::DDim input_dims = input->dims(); - const size_t row = framework::product( - framework::slice_ddim(input_dims, 0, input_dims.size() - 1)); - const size_t col = input_dims[input_dims.size() - 1]; - - #pragma omp parallel for - for (size_t i = 0; i < row; i++) { - std::vector> vec(col); - const float *input_ptr = input_data + i * col; - float *output_ptr = output_data + i * param.k_; - int64_t *indices_ptr = indices_data + i * param.k_; - - for (size_t j = 0; j < col; j++) { - vec[j] = std::move(std::pair(input_ptr[j], j)); - } - std::partial_sort( - vec.begin(), vec.begin() + param.k_, vec.end(), - [](const std::pair &l, - const std::pair &r) { return l.first > r.first; }); - for (int j = 0; j < param.k_; ++j) { - output_ptr[j] = vec[j].first; - indices_ptr[j] = static_cast(vec[j].second); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // TOP_K_OP diff --git a/mobile/src/operators/kernel/arm/transpose2_kernel.cpp b/mobile/src/operators/kernel/arm/transpose2_kernel.cpp deleted file mode 100644 index 
54c88015cb..0000000000 --- a/mobile/src/operators/kernel/arm/transpose2_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TRANSPOSE2_OP - -#include "operators/kernel/transpose2_kernel.h" - -namespace paddle_mobile { -namespace operators { - -bool IsShuffleChannel(const std::vector &axis) { - bool is_shuffle_channel = true; - if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { - for (int i = 3; i < axis.size(); ++i) { - if (axis[i] != i) { - is_shuffle_channel = false; - break; - } - } - } else { - return false; - } - return is_shuffle_channel; -} - -template -void ShuffleChannelCompute(const Transpose2Param ¶m) { - const std::vector &axis = param.Axis(); - const Tensor *input = param.InputX(); - const Dtype *input_ptr = input->data(); - Tensor *output = param.Out(); - Dtype *output_ptr = output->mutable_data(); - // input and output's shape dimension must >= 2 && <= 6. 
- const framework::DDim &in_dim = input->dims(); - const framework::DDim &out_dim = output->dims(); - size_t offset = 1; - for (int i = 3; i < axis.size(); ++i) { - offset *= in_dim[i]; - } - - #pragma omp parallel for collapse(3) - for (int batch = 0; batch < out_dim[0]; ++batch) { - for (int c1 = 0; c1 < out_dim[1]; ++c1) { - for (int c2 = 0; c2 < out_dim[2]; ++c2) { - size_t out_offset = - ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; - size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; - memcpy(output_ptr + out_offset, input_ptr + in_offset, - offset * sizeof(Dtype)); - } - } - } -} - -template -void Transpose2Compute(const Transpose2Param ¶m) { - const std::vector &axis = param.Axis(); - const Tensor *input = param.InputX(); - const Dtype *input_ptr = input->data(); - Tensor *output = param.Out(); - Dtype *output_ptr = output->mutable_data(); - // input and output's shape dimension must >= 2 && <= 6. - const framework::DDim &in_dim = input->dims(); - const framework::DDim &out_dim = output->dims(); - - // precompute inverted output dim and strides - size_t rout_dim[6], strides[6]; - int permute = axis.size(); // permute must >=2 && <= 6. 
- for (int i = 0; i < permute; ++i) { - int k = permute - 1 - i; - strides[k] = 1; - for (int j = axis[i] + 1; j < permute; ++j) { - strides[k] *= in_dim[j]; - } - rout_dim[k] = out_dim[i]; - } - // unroll the first 2 dimensions - int reamin_dim = 1; - for (int i = 2; i < out_dim.size(); ++i) { - reamin_dim *= out_dim[i]; - } - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < out_dim[0]; ++batch) { - for (int j = 0; j < out_dim[1]; ++j) { - size_t offset = batch * strides[permute - 1] + j * strides[permute - 2]; - Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim; - int indics[4] = {0, 0, 0, 0}; - for (int k = 0; k < reamin_dim; ++k) { - out_ptr[k] = input_ptr[offset]; - indics[0] += 1; - offset += strides[0]; - for (int p = 0; p < permute - 3; ++p) { - if (indics[p] == rout_dim[p]) { - indics[p + 1] += 1; - indics[p] = 0; - offset += strides[p + 1]; - offset -= rout_dim[p] * strides[p]; - } else { - break; - } - } - } - } - } -} - -template <> -bool Transpose2Kernel::Init(Transpose2Param *param) { - return true; -} - -template <> -void Transpose2Kernel::Compute(const Transpose2Param ¶m) { - const std::vector &axis = param.Axis(); - bool shuffle_channel = IsShuffleChannel(axis); - if (shuffle_channel) { - if (param.InputX()->type() == type_id().hash_code()) { - ShuffleChannelCompute(param); - } else { - ShuffleChannelCompute(param); - } - } else { - if (param.InputX()->type() == type_id().hash_code()) { - Transpose2Compute(param); - } else { - Transpose2Compute(param); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // TRANSPOSE2_OP diff --git a/mobile/src/operators/kernel/arm/transpose_kernel.cpp b/mobile/src/operators/kernel/arm/transpose_kernel.cpp deleted file mode 100644 index f90376eb50..0000000000 --- a/mobile/src/operators/kernel/arm/transpose_kernel.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef TRANSPOSE_OP - -#include "operators/kernel/transpose_kernel.h" -#include "operators/kernel/central-arm-func/transpose_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool TransposeKernel::Init(TransposeParam *param) { - return true; -} - -template <> -void TransposeKernel::Compute(const TransposeParam ¶m) { - TransposeCompute(param); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/arm/while_kernel.cpp b/mobile/src/operators/kernel/arm/while_kernel.cpp deleted file mode 100644 index 43e88aad4d..0000000000 --- a/mobile/src/operators/kernel/arm/while_kernel.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef WHILE_OP - -#include "operators/kernel/while_kernel.h" -#include "framework/loader.h" -#include "framework/lod_tensor.h" -#include "framework/op_registry.h" -#include "framework/operator.h" - -namespace paddle_mobile { -namespace operators { - -class WhileStepExecutor { - typedef std::shared_ptr> OperatorPtr; - - public: - WhileStepExecutor(const framework::BlockDesc *block, framework::Scope *scope) - : scope_(scope) { - std::vector> ops = block->Ops(); - ops_of_block_.resize(ops.size()); - for (int i = 0; i < ops.size(); ++i) { - std::shared_ptr op_desc = ops[i]; - DLOG << "while kernel create op: " << op_desc->Type(); - auto op_handler = framework::OpRegistry::CreateOp( - op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(), - op_desc->GetAttrMap(), scope_); - op_handler->Init(); - ops_of_block_[i] = op_handler; - } - } - - void Run() { - for (int i = 0; i < ops_of_block_.size(); ++i) { - auto &op_handler = ops_of_block_[i]; - DLOG << "while kernel InferShape op: " << i - << "th : " << op_handler->Type(); - op_handler->InferShape(); - DLOG << "while kernel Run op: " << i << "th : " << op_handler->Type(); - op_handler->Run(); - } - } - - void CreateVariables(Scope &scope, const WhileParam ¶m) { - for (const auto &var_desc : param.sub_block_->Vars()) { - auto var = scope.Var(var_desc->Name()); - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - if (var_desc->Persistable()) { - auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->framework::Variable::GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } else { - auto dim = var_desc->Tensor_desc().Dims(); - if (dim.size() == 0) { - auto tensor = var->framework::Variable::GetMutable(); - framework::DDim dDim = {0}; - tensor->Resize(dDim); - } else { - for (auto &d : dim) { - if (d < 0) { - d *= -1; - } - } - auto tensor = var->framework::Variable::GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } - } - } else { - // TODO(codeWorm) - } - } - } - - 
private: - framework::Scope *scope_; - std::vector ops_of_block_; -}; - -template <> -bool WhileKernel::Init(WhileParam *param) { - return true; -} - -template <> -void WhileKernel::Compute(const WhileParam ¶m) { - DLOG << "WhileKernel Compute"; - WhileStepExecutor executor(param.sub_block_, param.scope_); - auto ¤t_scope = param.scope_->NewScope(); - executor.CreateVariables(current_scope, param); - while (param.cond_->data()[0]) { - if (param.is_test) { - for (auto &name : current_scope.LocalVarNames()) { - auto *var = current_scope.Var(name); - if (var->IsType()) { - // Clear all lod information for all lod_tensors. - auto *t = var->GetMutable(); - framework::LoD empty_lod; - t->set_lod(empty_lod); - } else if (var->IsType()) { - // Clear elements of all tensor arrays. - auto *t = var->GetMutable(); - t->clear(); - } else { - // todo - } - } - } - executor.Run(); - } - param.scope_->DeleteScope(¤t_scope); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // WHILE_OP diff --git a/mobile/src/operators/kernel/assign_kernel.h b/mobile/src/operators/kernel/assign_kernel.h deleted file mode 100644 index 0d06bb7521..0000000000 --- a/mobile/src/operators/kernel/assign_kernel.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ASSIGN_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class AssignParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - AssignParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = OpParam::InputXFrom(inputs, *scope); - output_ = OpParam::OutFrom(outputs, *scope); - } - - const GType *Input() const { return input_; } - - GType *Output() const { return output_; } - - private: - GType *input_; - GType *output_; -}; - -DECLARE_KERNEL(Assign, AssignParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // ASSIGN_OP diff --git a/mobile/src/operators/kernel/assign_value_kernel.h b/mobile/src/operators/kernel/assign_value_kernel.h deleted file mode 100644 index 5fae921876..0000000000 --- a/mobile/src/operators/kernel/assign_value_kernel.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ASSIGN_VALUE_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class AssignValueParam : public OpParam { - public: - AssignValueParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - output_ = GET_VAR_AS_LOD_TENSOR("Out", outputs, *scope); - shape_ = OpParam::GetAttr>("shape", attrs); - fp32_values_ = OpParam::GetAttr>("fp32_values", attrs); - int32_values_ = OpParam::GetAttr>("int32_values", attrs); - dtype_ = OpParam::GetAttr("dtype", attrs); - } - - public: - framework::LoDTensor *output_; - std::vector shape_; - std::vector fp32_values_; - std::vector int32_values_; - int dtype_; -}; - -DECLARE_KERNEL(AssignValue, AssignValueParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // ASSIGN_VALUE_OP diff --git a/mobile/src/operators/kernel/batchnorm_kernel.h b/mobile/src/operators/kernel/batchnorm_kernel.h deleted file mode 100644 index 1f2db456d3..0000000000 --- a/mobile/src/operators/kernel/batchnorm_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BATCHNORM_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class BatchNormKernel - : public framework::OpKernelBase> { - public: - void Compute(const BatchNormParam ¶m); - bool Init(BatchNormParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/beam_search_decode_kernel.h b/mobile/src/operators/kernel/beam_search_decode_kernel.h deleted file mode 100644 index 36cc7f9f2d..0000000000 --- a/mobile/src/operators/kernel/beam_search_decode_kernel.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BEAM_SEARCH_DECODE_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class BeamSearchDecodeParam : public OpParam { - public: - BeamSearchDecodeParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - ids_ = - OpParam::GetVarValue("Ids", inputs, *scope); - scores_ = OpParam::GetVarValue("Scores", inputs, - *scope); - sentence_ids_ = OpParam::GetVarValue("SentenceIds", - outputs, *scope); - sentence_scores_ = OpParam::GetVarValue( - "SentenceScores", outputs, *scope); - beam_size_ = OpParam::GetAttr("beam_size", attrs); - end_id_ = OpParam::GetAttr("end_id", attrs); - } - - public: - framework::LoDTensorArray *ids_; - framework::LoDTensorArray *scores_; - framework::LoDTensor *sentence_ids_; - framework::LoDTensor *sentence_scores_; - int beam_size_; - int end_id_; -}; - -DECLARE_KERNEL(BeamSearchDecode, BeamSearchDecodeParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // BEAM_SEARCH_DECODE_OP diff --git a/mobile/src/operators/kernel/beam_search_kernel.h b/mobile/src/operators/kernel/beam_search_kernel.h deleted file mode 100644 index bb4a3ced17..0000000000 --- a/mobile/src/operators/kernel/beam_search_kernel.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BEAM_SEARCH_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class BeamSearchParam : public OpParam { - public: - BeamSearchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - pre_ids_ = GET_VAR_AS_LOD_TENSOR("pre_ids", inputs, *scope); - pre_scores_ = GET_VAR_AS_LOD_TENSOR("pre_scores", inputs, *scope); - ids_ = GET_VAR_AS_LOD_TENSOR("ids", inputs, *scope); - scores_ = GET_VAR_AS_LOD_TENSOR("scores", inputs, *scope); - - selected_ids_ = GET_VAR_AS_LOD_TENSOR("selected_ids", outputs, *scope); - selected_scores_ = - GET_VAR_AS_LOD_TENSOR("selected_scores", outputs, *scope); - if (outputs.count("parent_idx")) { - parent_idx_ = GET_VAR_AS_LOD_TENSOR("parent_idx", outputs, *scope); - } else { - parent_idx_ = new framework::Tensor(); - } - - level_ = OpParam::GetAttr("level", attrs); - beam_size_ = OpParam::GetAttr("beam_size", attrs); - end_id_ = OpParam::GetAttr("end_id", attrs); - if (OpParam::HasAttr("is_accumulated", attrs)) { - is_accumulated_ = OpParam::GetAttr("is_accumulated", attrs); - } - } - - public: - framework::LoDTensor *pre_ids_; - framework::LoDTensor *pre_scores_; - framework::LoDTensor *ids_; - framework::LoDTensor *scores_; - - framework::LoDTensor *selected_ids_; - framework::LoDTensor *selected_scores_; - framework::Tensor *parent_idx_; - - int level_; - int beam_size_; - int end_id_; - bool is_accumulated_ = true; -}; - -DECLARE_KERNEL(BeamSearch, BeamSearchParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // BEAM_SEARCH_OP diff --git a/mobile/src/operators/kernel/bilinear_interp_kernel.h b/mobile/src/operators/kernel/bilinear_interp_kernel.h deleted file mode 100644 index 9a68fe65a5..0000000000 --- 
a/mobile/src/operators/kernel/bilinear_interp_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BILINEAR_INTERP_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class BilinearInterpKernel - : public framework::OpKernelBase> { - public: - void Compute(const BilinearInterpParam& param); - bool Init(BilinearInterpParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/box_coder_kernel.h b/mobile/src/operators/kernel/box_coder_kernel.h deleted file mode 100644 index eadb21b3d5..0000000000 --- a/mobile/src/operators/kernel/box_coder_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BOXCODER_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/math/transform.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class BoxCoderKernel - : public framework::OpKernelBase> { - public: - void Compute(const BoxCoderParam& param); - bool Init(BoxCoderParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/activation_arm_func.h b/mobile/src/operators/kernel/central-arm-func/activation_arm_func.h deleted file mode 100644 index 07663ae2ae..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/activation_arm_func.h +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "operators/math/activation.h" -#include "operators/op_param.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif // __ARM_NEON__ - -namespace paddle_mobile { -namespace operators { - -template -struct ActivationCompute { - void operator()(const Tensor *input, Tensor *output) {} - void operator()(const Tensor *input, Tensor *output, float alpha) {} -}; - -template -struct ActivationCompute { - void operator()(const Tensor *input, Tensor *output) { - const float *x = input->data(); - float *y = output->mutable_data(); - size_t remain = input->numel(); -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = remain >> 4; - remain = remain & 0xF; - -#pragma omp parallel for - for (size_t i = 0; i < loop; ++i) { - const float *local_x = x + (i << 4); - float *local_y = y + (i << 4); - float32x4_t r0 = vld1q_f32(local_x); - float32x4_t r1 = vld1q_f32(local_x + 4); - float32x4_t r2 = vld1q_f32(local_x + 8); - float32x4_t r3 = vld1q_f32(local_x + 12); - r0 = math::vActiveq_f32(r0); - r1 = math::vActiveq_f32(r1); - r2 = math::vActiveq_f32(r2); - r3 = math::vActiveq_f32(r3); - vst1q_f32(local_y, r0); - vst1q_f32(local_y + 4, r1); - vst1q_f32(local_y + 8, r2); - vst1q_f32(local_y + 12, r3); - } - x += (loop << 4); - y += (loop << 4); -#endif - for (size_t i = 0; i < remain; ++i) { - y[i] = math::Active(x[i]); - } - } - - void operator()(const Tensor *input, Tensor *output, float falpha) { - const float *x = input->data(); - float *y = output->mutable_data(); - size_t remain = input->numel(); - float alphas[4] = {falpha, falpha, falpha, falpha}; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = remain >> 4; - remain = remain & 0xF; - -#pragma omp parallel for - for (size_t i = 0; i < loop; ++i) { - const float *local_x = x + (i << 4); - float *local_y = y + (i << 4); - float32x4_t r0 = vld1q_f32(local_x); - float32x4_t r1 = vld1q_f32(local_x + 4); - float32x4_t r2 = vld1q_f32(local_x + 8); - float32x4_t 
r3 = vld1q_f32(local_x + 12); - float32x4_t a_r0 = vld1q_f32(alphas); - float32x4_t a_r1 = vld1q_f32(alphas); - float32x4_t a_r2 = vld1q_f32(alphas); - float32x4_t a_r3 = vld1q_f32(alphas); - r0 = math::vActiveq_f32(r0, a_r0); - r1 = math::vActiveq_f32(r1, a_r1); - r2 = math::vActiveq_f32(r2, a_r2); - r3 = math::vActiveq_f32(r3, a_r3); - vst1q_f32(local_y, r0); - vst1q_f32(local_y + 4, r1); - vst1q_f32(local_y + 8, r2); - vst1q_f32(local_y + 12, r3); - } - x += (loop << 4); - y += (loop << 4); -#endif - for (size_t i = 0; i < remain; ++i) { - y[i] = math::Active(x[i], falpha); - } - } -}; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/mobile/src/operators/kernel/central-arm-func/batchnorm_arm_func.h deleted file mode 100644 index 300cd32a69..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/batchnorm_arm_func.h +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BATCHNORM_OP - -#pragma once - -#include -#include "operators/op_param.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif // __ARM_NEON__ - -namespace paddle_mobile { -namespace operators { - -template -void BatchnormCompute(const BatchNormParam ¶m) { - const float epsilon = param.Epsilon(); - const float *mean_ptr = param.InputMean()->data(); - const float *variance_ptr = param.InputVariance()->data(); - const float *scale_ptr = param.InputScale()->data(); - const float *bias_ptr = param.InputBias()->data(); - - const framework::Tensor *input = param.InputX(); - const float *input_ptr = input->data(); - framework::Tensor *output = param.OutputY(); - float *output_ptr = output->mutable_data(); - size_t spatial_size = output->dims()[2] * output->dims()[3]; - int channels = output->dims()[1]; - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < output->dims()[0]; ++batch) { - for (int c = 0; c < channels; ++c) { - float inv_scale = 1.f / (std::sqrt(variance_ptr[c] + epsilon)); - float bias = bias_ptr[c] - inv_scale * scale_ptr[c] * mean_ptr[c]; - float scale = inv_scale * scale_ptr[c]; - size_t offset = (batch * channels + c) * spatial_size; - const float *x = input_ptr + offset; - float *y = output_ptr + offset; - size_t remain = spatial_size; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - int loop = spatial_size >> 4; - remain = spatial_size & 0xF; - float32x4_t __scale = vdupq_n_f32(scale); - float32x4_t __bias = vdupq_n_f32(bias); - for (int k = 0; k < loop; ++k, x += 16, y += 16) { - float32x4_t r0 = vld1q_f32(x); - float32x4_t r1 = vld1q_f32(x + 4); - float32x4_t r2 = vld1q_f32(x + 8); - float32x4_t r3 = vld1q_f32(x + 12); - r0 = vmlaq_f32(__bias, __scale, r0); - r1 = vmlaq_f32(__bias, __scale, r1); - r2 = vmlaq_f32(__bias, __scale, r2); - r3 = vmlaq_f32(__bias, __scale, r3); - vst1q_f32(y, r0); - vst1q_f32(y + 4, r1); - vst1q_f32(y + 8, r2); - vst1q_f32(y + 12, r3); - } -#endif // __ARM_NEON__ - for (int 
k = 0; k < remain; ++k) { - y[k] = scale * x[k] + bias; - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h b/mobile/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h deleted file mode 100644 index 3840985ab8..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BILINEAR_INTERP_OP -#pragma once - -#include -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void BilinearInterpCompute(const BilinearInterpParam& param) { - auto out_dims = param.Out()->dims(); - auto* input = param.InputX()->data(); - auto out_size_t = param.InputOutPutSize(); - - int out_h = param.OutH(); - int out_w = param.OutW(); - if (out_size_t != nullptr) { - auto out_size_data = out_size_t->data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - auto* output = param.Out()->mutable_data( - {out_dims[0], out_dims[1], out_h, out_w}); - auto batch_size = param.InputX()->dims()[0]; - auto channels = param.InputX()->dims()[1]; - auto in_h = param.InputX()->dims()[2]; - auto in_w = param.InputX()->dims()[3]; - - auto in_hw = in_h * in_w; - auto out_hw = out_h * out_w; - auto in_chw = channels * in_hw; - auto out_chw = channels * out_hw; - - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; - - if (in_h == out_h && in_w == out_w) { - memcpy(output, input, param.InputX()->numel() * sizeof(float)); - } else { - for (int k = 0; k < batch_size; ++k) { // loop for batches - for (int i = 0; i < out_h; ++i) { // loop for images - int h = ratio_h * i; - int hid = (h < in_h - 1) ? 1 : 0; - float h1lambda = ratio_h * i - h; - float h2lambda = 1.f - h1lambda; - - for (int j = 0; j < out_w; ++j) { - int w = ratio_w * j; - int wid = (w < in_w - 1) ? 
1 : 0; - float w1lambda = ratio_w * j - w; - float w2lambda = 1.f - w1lambda; - // calculate four position for bilinear interpolation - const float* in_pos = &input[k * in_chw + h * in_w + w]; - float* out_pos = &output[k * out_chw + i * out_w + j]; - - for (int c = 0; c < channels; ++c) { // loop for channels - // bilinear interpolation - out_pos[0] = static_cast( - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + - h1lambda * (w2lambda * in_pos[hid * in_w] + - w1lambda * in_pos[hid * in_w + wid])); - in_pos += in_hw; - out_pos += out_hw; - } - } - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/box_coder_arm_func.h b/mobile/src/operators/kernel/central-arm-func/box_coder_arm_func.h deleted file mode 100644 index 9cdc22cff0..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/box_coder_arm_func.h +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BOXCODER_OP -#pragma once - -#include -#include "framework/tensor.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void EncodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - const framework::Tensor& prior_box_var, T* output) { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); - - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; - T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; - T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; - - T target_box_center_x = - (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; - T target_box_center_y = - (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; - T target_box_width = - target_box_data[i * len + 2] - target_box_data[i * len]; - T target_box_height = - target_box_data[i * len + 3] - target_box_data[i * len + 1]; - - size_t offset = i * col * len + j * len; - output[offset] = (target_box_center_x - prior_box_center_x) / - prior_box_width / prior_box_var_data[j * len]; - output[offset + 1] = (target_box_center_y - prior_box_center_y) / - prior_box_height / prior_box_var_data[j * len + 1]; - output[offset + 2] = - std::log(std::fabs(target_box_width / prior_box_width)) / - prior_box_var_data[j * len + 2]; - output[offset + 3] = - std::log(std::fabs(target_box_height / prior_box_height)) / - prior_box_var_data[j * len + 3]; - } - } -} - -template -void DecodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - 
const framework::Tensor& prior_box_var, T* output) { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; - - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); - - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - size_t offset = i * col * len + j * len; - T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; - T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; - T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; - - T target_box_center_x = prior_box_var_data[j * len] * - target_box_data[offset] * prior_box_width + - prior_box_center_x; - T target_box_center_y = prior_box_var_data[j * len + 1] * - target_box_data[offset + 1] * - prior_box_height + - prior_box_center_y; - T target_box_width = std::exp(prior_box_var_data[j * len + 2] * - target_box_data[offset + 2]) * - prior_box_width; - T target_box_height = std::exp(prior_box_var_data[j * len + 3] * - target_box_data[offset + 3]) * - prior_box_height; - - output[offset] = target_box_center_x - target_box_width / 2; - output[offset + 1] = target_box_center_y - target_box_height / 2; - output[offset + 2] = target_box_center_x + target_box_width / 2; - output[offset + 3] = target_box_center_y + target_box_height / 2; - } - } -} - -template -void BoxCoderCompute(const BoxCoderParam& param) { - const auto* input_priorbox = param.InputPriorBox(); - const auto* input_priorboxvar = param.InputPriorBoxVar(); - const auto* input_targetbox = param.InputTargetBox(); - - const auto& code_type = param.CodeType(); - - auto row = input_targetbox->dims()[0]; - auto col = input_priorbox->dims()[0]; - auto len = input_priorbox->dims()[1]; - - framework::Tensor* output_box = 
param.OutputBox(); - auto* output_box_dataptr = output_box->mutable_data({row, col, len}); - - if (code_type == "encode_center_size") { - EncodeCenterSize(*input_targetbox, *input_priorbox, - *input_priorboxvar, output_box_dataptr); - } - if (code_type == "decode_center_size") { - DecodeCenterSize(*input_targetbox, *input_priorbox, - *input_priorboxvar, output_box_dataptr); - } -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/concat_arm_func.h b/mobile/src/operators/kernel/central-arm-func/concat_arm_func.h deleted file mode 100644 index 4b22857302..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/concat_arm_func.h +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONCAT_OP -#pragma once - -#include - -namespace paddle_mobile { -namespace operators { -template -class ConcatFunctor { - public: - void operator()(const std::vector &input, const int axis, - framework::Tensor *output) { - size_t num = input.size(); - int rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - - // computation - for (int k = 0; k < out_rows; ++k) { - T *dst_ptr = output->data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - const T *src_prt = input[j].data() + k * col_len; - memory::Copy(dst_ptr + col_idx, src_prt, sizeof(T) * col_len); - col_idx += col_len; - } - } - } -}; - -template -void ConcatCompute(const ConcatParam ¶m) { - auto inputs = param.Inputs(); - auto *out = param.Out(); - int axis = param.Axis(); - out->mutable_data

(); - - /// Sometimes direct copies will be faster, this maybe need deeply analysis. - if (axis == 0 && inputs.size() < 10) { - size_t output_offset = 0; - for (auto *in : inputs) { - auto in_stride = framework::stride_numel(in->dims()); - auto out_stride = framework::stride_numel(out->dims()); - auto dst = out->data

() + output_offset; - auto src = in->data

(); - PADDLE_MOBILE_ENFORCE( - in_stride.size() == out_stride.size(), - "src and dst tensor should have the same dims size."); - memory::Copy(dst, src, sizeof(P) * in_stride[0]); - output_offset += in_stride[0]; - } - } else { - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = *inputs[j]; - } - ConcatFunctor

concat_functor; - concat_functor(inputs_concat, axis, out); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/mobile/src/operators/kernel/central-arm-func/conv_add_arm_func.h deleted file mode 100644 index 0051fc9ae8..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_add_arm_func.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP -#pragma once - -#include -#include "operators/math/conv_func.h" -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -void ConvAddBasic(const FusionConvAddParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - Tensor *output = param.Output(); - output->mutable_data(); - float *biase_data = bias.data(); - - int axis = param.Axis(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - 
output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::MatMul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1), false, biase_data); - } - } -} - -template -void ConvAddCompute(const FusionConvAddParam ¶m) { - param.Output()->mutable_data(); - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && - param.paddings_[0] == 1) { - math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), - param.Bias(), true, false); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - // 
math::DepthwiseConv3x3(param.Input(), param.Strides(), - // param.Paddings(), - // param.Filter(), param.Bias(), - // param.Output(), false); - if (param.Paddings()[0] == 0) { - math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), - param.Bias(), true, false); - } else { - math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), - param.Output(), param.Bias(), true, false); - } - } else { - ConvAddBasic(param); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h b/mobile/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h deleted file mode 100644 index 5ee1e251d9..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADDBNRELU_OP - -#pragma once - -#include -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor new_bias = *param.NewBias(); - Tensor new_scale = *param.NewScale(); - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / 
(output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - math::MatMulWithBn(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0), true, &new_scale, &new_bias, g); - } - } -} - -template -void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && - param.paddings_[0] == 1) { - math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - 
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else { - ConvAddBNReluBasic(param); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/mobile/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h deleted file mode 100644 index 9f8e885a31..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h +++ /dev/null @@ -1,154 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADDRELU_OP - -#pragma once -#include -#include -#include "operators/math/conv_func.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void ConvAddReluBasic(const FusionConvAddReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - int32_t axis = param.Axis(); - Otype *bias_data = bias.data(); - Tensor *output = param.Output(); - output->mutable_data(); - - float alpha = 1.0f; - float beta = 1.0f; - int32_t groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int32_t batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - 
framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int32_t in_step = static_cast(input->dims()[1]) / groups; - int32_t out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int32_t i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int32_t g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - math::MatMul(filter_slice, false, col_matrix, false, alpha, - &out_slice, beta, true, bias_data); - } - } -} - -template -void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { - param.Output()->mutable_data(); - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && - param.paddings_[0] == 1) { - math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), - param.Bias(), true, true); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] 
== 3 && param.Strides()[0] == 2) { - // math::DepthwiseConv3x3(param.Input(), param.Strides(), - // param.Paddings(), - // param.Filter(), param.Bias(), - // param.Output(), false); - if (param.Paddings()[0] == 0) { - math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), - param.Bias(), true, true); - } else { - math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), - param.Output(), param.Bias(), true, true); - } - } else { - ConvAddReluBasic(param); - } -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_arm_func.cpp b/mobile/src/operators/kernel/central-arm-func/conv_arm_func.cpp deleted file mode 100644 index 606a7f1ddc..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_arm_func.cpp +++ /dev/null @@ -1,379 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef CONV_OP - -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include -#include "framework/context.h" -#include "operators/math/depthwise/faster_depthwise_conv3x3.h" -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/depthwise_conv5x5.h" -#include "operators/math/gemm/gemm1x1s1.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/pad.h" -#include "operators/math/slidingwindow_conv3x3.h" -#include "operators/math/vol2col.h" -#include "operators/math/winograd/winograd_transform.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -int ConvOutputSize(int input_size, int filter_size, int dilation, int padding, - int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - return output_size; -} - -bool IsExpand(const std::vector &filter_dim, - const std::vector &strides, const std::vector &paddings, - const std::vector &dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - -#ifdef PADDLE_MOBILE_CPU -template -void GemmConv(const ConvParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - const std::vector strides = param.Strides(); - const std::vector paddings = param.Paddings(); - const std::vector dilations = param.Dilations(); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - std::vector output_shape_vec(framework::vectorize(output->dims())); - 
size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - const int batch_size = static_cast(input->dims()[0]); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - // col_matrix.ShareDataWith(in_slice); - col_matrix = in_slice; - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, 
dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - math::MatMul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0), false, - static_cast(nullptr)); - } - } -} - -template -void GemmConv1x1s1(const ConvParam ¶m, const float *bias, bool is_bias, - bool is_relu) { - const Tensor *input = param.Input(); - Tensor filter = *param.transformed_filter_; - Tensor *output = param.Output(); - output->mutable_data(); - - const float *din = input->data(); - float *dout = output->mutable_data(); - const int num = input->dims()[0]; - const int chin = input->dims()[1]; - const int hin = input->dims()[2]; - const int win = input->dims()[3]; - const int chout = output->dims()[1]; - const int hout = output->dims()[2]; - const int wout = output->dims()[3]; - const float *weights = filter.mutable_data(); - int channel_size_out = wout * hout; - int channel_size_in = win * hin; - const int group = param.Groups(); - const int m = chout / group; - const int n = hout * wout; - const int k = chin / group; - - bool flag_relu = true; - bool flag_bias = true; - - if (!is_bias) { - bias = nullptr; - flag_bias = false; - } - if (!is_relu) { - flag_relu = false; - } - ARMArch arch = framework::CPUContext::Context()->get_arch(); - int hblock = math::get_hblock(arch); - - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; - } - - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - float *dout_group = - static_cast(dout) + (b * chout + g * m) * channel_size_out; - const float *din_group = static_cast(din) + - (b * chin + g * k) * channel_size_in; - const float *weights_group = - static_cast(weights) + g * weights_size_per_group; - const float *bias_group = static_cast(bias) + g 
* m; - if (n > 1) { - math::sgemm_prepack(weights_group, din_group, bias_group, dout_group, m, - n, k, flag_bias, flag_relu, false, arch); - } - } - } -} - -template -void WinogradConv3x3(const ConvParam ¶m) { - const Tensor *input = param.Input(); - const Tensor *filter = param.transformed_filter_; - Tensor *output = param.Output(); - output->mutable_data(); - int batch_size = input->dims()[0]; - int groups = param.Groups(); - const std::vector &paddings = param.Paddings(); - - auto winograd_pad = [&](int width, int pad) { - int output_tile = tile - kernel + 1; - // int tiles = (width + pad - kernel) / output_tile + 1; - // return (tiles - 1) * output_tile + tile - width; - int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile; - return pad_width + tile - width; - }; - - math::PadFunctor pad; - Tensor input_pad; - framework::Tensor transformed_input; - for (int i = 0; i < batch_size; ++i) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]); - // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]); - int pad_bottom = paddings[0]; - int pad_right = paddings[1]; - if (paddings[0] || paddings[1] || pad_bottom || pad_right) { - framework::DDim pad_shape = in_batch.dims(); - pad_shape[2] += paddings[0] + pad_bottom; - pad_shape[3] += paddings[1] + pad_right; - input_pad.mutable_data(pad_shape); - pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right, - &input_pad); - } else { - input_pad = in_batch; - } - // tile input and transform - math::winograd_transform_input(input_pad, &transformed_input); - // caculate output - math::winograd_transform_output(transformed_input, *filter, - output); - } -} - -template -void DepthwiseConv3x3(const ConvParam ¶m) { - const Tensor *input = param.Input(); - const Tensor *filter = param.Filter(); - const std::vector &paddings = param.Paddings(); - const std::vector &strides = param.Strides(); - 
const int batch_size = input->dims()[0]; - Tensor *output = param.Output(); - output->mutable_data(); - - if (strides[0] == 1) { - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - math::DepthwiseConv3x3S1(in_batch, *filter, paddings, - &out_batch); - } - } else if (strides[0] == 2) { - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - math::DepthwiseConv3x3S2(in_batch, *filter, paddings, - &out_batch); - } - } else { - GemmConv(param); - } -} - -void FasterDepthwiseConv3x3_bias_relu(const ConvParam ¶m, - const float *bias, bool flag_relu) { - const Tensor *input = param.Input(); - const Tensor *filter = param.Filter(); - const std::vector &paddings = param.Paddings(); - const std::vector &strides = param.Strides(); - const int batch_size = input->dims()[0]; - Tensor *output = param.Output(); - output->mutable_data(); - - int pad = paddings[0]; - int stride = strides[0]; - const float *din = input->data(); - float *dout = output->mutable_data(); - const float *weights = filter->data(); - const int num = input->dims()[0]; - const int chin = input->dims()[1]; - const int hin = input->dims()[2]; - const int win = input->dims()[3]; - const int chout = output->dims()[1]; - const int hout = output->dims()[2]; - const int wout = output->dims()[3]; - bool flag_bias = bias != nullptr; - if (pad == 1) { - math::depthwise::conv_depthwise_3x3p1(din, dout, num, chout, hout, wout, - chin, hin, win, weights, bias, stride, - flag_bias, flag_relu); - } -} - -template -void DepthwiseConv5x5(const ConvParam ¶m) { - const Tensor *input = param.Input(); - const Tensor *filter = param.Filter(); - const std::vector &paddings = param.Paddings(); - const std::vector &strides = param.Strides(); - const int batch_size = input->dims()[0]; - Tensor *output = param.Output(); - output->mutable_data(); - - if (strides[0] == 1) { - for 
(int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - math::DepthwiseConv5x5S1(in_batch, *filter, paddings, - &out_batch); - } - } else { - GemmConv(param); - } -} - -template -void SlidingwindowConv3x3(const ConvParam ¶m, const float *bias, - bool is_bias, bool is_relu) { - const Tensor *input = param.Input(); - const Tensor *filter = param.Filter(); - const std::vector &paddings = param.Paddings(); - const std::vector &strides = param.Strides(); - Tensor *output = param.Output(); - output->mutable_data(); - - if (strides[0] == 1) { - // math::SlidingwindowConv3x3s1(input, filter, paddings, - // output); - math::SlidingwindowConv3x3s1Faster( - input, param.transformed_filter_, paddings, output, bias, is_bias, - is_relu); - } else if (strides[0] == 2) { - // math::SlidingwindowConv3x3s2(input, filter, paddings, - // output); - math::SlidingwindowConv3x3s2Faster( - input, param.transformed_filter_, paddings, output, bias, is_bias, - is_relu); - } else { - GemmConv(param); - } -} - -template void GemmConv(const ConvParam ¶m); -template void GemmConv1x1s1(const ConvParam ¶m, - const float *bias, bool is_bias, - bool is_relu); -template void WinogradConv3x3<8, 3>(const ConvParam ¶m); -template void DepthwiseConv3x3(const ConvParam ¶m); -template void DepthwiseConv5x5(const ConvParam ¶m); -template void SlidingwindowConv3x3(const ConvParam ¶m, - const float *bias, - bool is_bias, bool is_relu); - -template void GemmConv(const ConvParam ¶m); -#ifndef __aarch64__ -template void DepthwiseConv3x3(const ConvParam ¶m); -template void DepthwiseConv5x5(const ConvParam ¶m); -#endif -#endif - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_arm_func.h b/mobile/src/operators/kernel/central-arm-func/conv_arm_func.h deleted file mode 100644 index 89b91f9d11..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_arm_func.h 
+++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#pragma once - -#include -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -int ConvOutputSize(int input_size, int filter_size, int dilation, int padding, - int stride); - -bool IsExpand(const std::vector &filter_dim, - const std::vector &strides, const std::vector &paddings, - const std::vector &dilations); - -template -void GemmConv(const ConvParam ¶m); - -template -void GemmConv1x1s1(const ConvParam ¶m, const float *bias, bool is_bias, - bool is_relu); - -template -void WinogradConv3x3(const ConvParam ¶m); - -template -void DepthwiseConv3x3(const ConvParam ¶m); - -template -void DepthwiseConv5x5(const ConvParam ¶m); - -template -void SlidingwindowConv3x3(const ConvParam ¶m, const float *bias, - bool is_bias, bool is_relu); - -void FasterDepthwiseConv3x3_bias_relu(const ConvParam ¶m, - const float *bias, bool flag_relu); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h b/mobile/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h deleted file mode 100644 index 1ff51aa39c..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBNADDRELU_OP - -#pragma once - -#include -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -void ConvBNAddReluBasic(const FusionConvBNAddReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor new_bias = *param.NewBias(); - Tensor new_scale = *param.NewScale(); - Tensor *bias1 = param.Bias(); - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - 
math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step); - math::MatMulWithBn(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1), true, &new_scale, &new_bias, g, - bias_data.data()); - } - } -} -template -void 
ConvBNAddReluCompute(const FusionConvBNAddReluParam ¶m) { - Tensor Bias; - Bias.mutable_data({param.Groups()}); - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && - param.paddings_[0] == 1) { - math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), - // param.Output(), param.NewScale(), - // param.NewBias(), 1); - math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else { - ConvBNAddReluBasic(param); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/mobile/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h deleted file mode 100644 index 5606eb3304..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBNRELU_OP - -#pragma once -#include -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -void ConvBNReluBasic(const FusionConvBNReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor new_bias = *param.NewBias(); - Tensor new_scale = *param.NewScale(); - - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - 
filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - math::MatMulWithBn(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0), true, &new_scale, &new_bias, g); - } - } -} - -template -void ConvBNReluCompute(const FusionConvBNReluParam ¶m) { - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && - param.paddings_[0] == 1) { - math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == 
param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), - // param.Output(), param.NewScale(), - // param.NewBias(), 1); - math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else { - ConvBNReluBasic(param); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h b/mobile/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h deleted file mode 100644 index 33ceefadd8..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef CONV_TRANSPOSE_OP - -#include -#include "framework/ddim.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void ConvTransposeCompute(const ConvTransposeParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data

(); - - auto strides = param.Strides(); - auto paddings = param.Paddings(); - auto dilations = param.Dilations(); - auto groups = param.Groups(); - - const int batch_size = input->dims()[0]; - - std::vector input_shape_vec = framework::vectorize(input->dims()); - std::vector filter_shape_vec = framework::vectorize(filter.dims()); - - size_t data_dim = filter_shape_vec.size() - 2; - - // 5 或者 7 - std::vector col_shape_vec(1 + 2 * data_dim); - - // output c / groups - col_shape_vec[0] = output->dims()[1] / groups; - for (size_t i = 0; i < data_dim; ++i) { - // filter shape filter h filter w - col_shape_vec[i + 1] = filter_shape_vec[i + 2]; - // input shape input h input w - col_shape_vec[i + 1 + data_dim] = input_shape_vec[i + 2]; - } - - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - Tensor col; - col.mutable_data

(col_shape); - - Tensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - framework::DDim output_shape = - framework::slice_ddim(output->dims(), 1, output->dims().size()); - - framework::DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; - - // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w) - framework::DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; - filter.Resize(filter_matrix_shape); - - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; - - for (int i = 0; i < batch_size; ++i) { - Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); - Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); - - for (int g = 0; g < groups; ++g) { - Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); - Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); - Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); - - math::MatMul(filter_slice, true, in_slice, false, - static_cast

(1.0), &col_matrix, static_cast

(0.0)); - if (data_dim == 2U) { - col2im(col, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &out_slice); - } else if (data_dim == 3U) { - col2vol(col, dilations, strides, paddings, &out_slice); - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/crf_arm_func.h b/mobile/src/operators/kernel/central-arm-func/crf_arm_func.h deleted file mode 100644 index 2cf95081e9..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/crf_arm_func.h +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CRF_OP -#pragma once - -#include -#include -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -template -void Decode(const Tensor& emission_weights, const Tensor& transition_weights, - Tensor* decoded_path) { - auto emission_dims = emission_weights.dims(); - const size_t seq_len = emission_dims[0]; - const size_t tag_num = emission_dims[1]; - - const size_t state_trans_base_idx = 2; - - const P* x = emission_weights.data

(); - const P* w = transition_weights.data

(); - int64_t* path = decoded_path->data(); - - // alpha is a memo table. An element alpha(k, v) records the score of the - // best sequence of tags from position 1 to position k with v being the end - // tag. - Tensor alpha; - P* alpha_value = alpha.mutable_data

(emission_dims); - Tensor track; - int* track_value = track.mutable_data(emission_dims); - for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; - - for (size_t k = 1; k < seq_len; ++k) { - for (size_t i = 0; i < tag_num; ++i) { - P max_score = -std::numeric_limits

::max(); - int max_j = 0; - for (size_t j = 0; j < tag_num; ++j) { - P score = alpha_value[(k - 1) * tag_num + j] + - w[(j + state_trans_base_idx) * tag_num + i]; - if (score > max_score) { - max_score = score; - max_j = j; - } - } - - alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; - track_value[k * tag_num + i] = max_j; - } - } - P max_score = -std::numeric_limits

::max(); - int max_i = 0; - for (size_t i = 0; i < tag_num; ++i) { - P score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; - if (score > max_score) { - max_score = score; - max_i = i; - } - } - path[seq_len - 1] = max_i; - for (int k = seq_len - 1; k >= 1; --k) { - path[k - 1] = max_i = track_value[k * tag_num + max_i]; - } -} -template -void CrfCompute(const CrfParam& param) { - auto* emission = param.InputEmission(); - auto* transition = param.InputTransition(); - auto* label = param.InputLabel(); - auto* decoded_path = param.outputVBP(); - // DLOG<<*emission; - // DLOG<<*transition; - // DLOG<<*label; - - PADDLE_MOBILE_ENFORCE(emission->NumLevels() == 1U, - "The Input(Emission) should be a sequence."); - auto lod = emission->lod(); - PADDLE_MOBILE_ENFORCE(lod.size(), - "The Input(Emission) should be a sequence."); - const size_t level = 0; - const size_t seq_num = lod[level].size() - 1; - int64_t* path = decoded_path->mutable_data(); - int numel = decoded_path->numel(); - memset(static_cast(path), 0, sizeof(int64_t) * numel); - for (size_t i = 0; i < seq_num; ++i) { - int start_pos = static_cast(lod[level][i]); - int end_pos = static_cast(lod[level][i + 1]); - Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); - Decode

(emission->Slice(start_pos, end_pos), *transition, - &decoded_path_one_seq); - } - if (label) { - PADDLE_MOBILE_ENFORCE(label->NumLevels() == 1U, - "The Input(Label) should be a sequence."); - const int64_t* label_value = label->data(); - size_t batch_size = emission->dims()[0]; - for (size_t i = 0; i < batch_size; ++i) { - path[i] = label_value[i] == path[i] ? 1 : 0; - } - } -} -} // namespace operators - -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/density_prior_box_arm_func.h b/mobile/src/operators/kernel/central-arm-func/density_prior_box_arm_func.h deleted file mode 100644 index 7e4c3599d0..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/density_prior_box_arm_func.h +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DENSITY_PRIORBOX_OP -#pragma once - -#include -#include -#include -#include - -namespace paddle_mobile { -namespace operators { - -template -struct ClipFunctor { - inline T operator()(T in) const { - return std::min(std::max(in, 0.), 1.); - } -}; - -template -void DensityPriorBoxCompute(const DensityPriorBoxParam ¶m) { - const auto *input_ = param.Input(); - const auto &input_dims = input_->dims(); - - const auto *input_image = param.InputImage(); - const auto &input_image_dims = input_image->dims(); - - auto densities = param.Densities(); - auto fixed_ratios = param.FixedRatios(); - - auto fixed_sizes = param.FixedSizes(); - - const auto &variances = param.Variances(); - const bool &clip = param.Clip(); - - const float &step_w = param.StepW(); - const float &step_h = param.StepH(); - const float &offset = param.Offset(); - - Tensor *output_boxes = param.OutputBoxes(); - auto output_boxes_dataptr = output_boxes->mutable_data(); - Tensor *output_variances = param.OutputVariances(); - auto output_variances_dataptr = output_variances->mutable_data(); - - auto img_width = input_image_dims[3]; - auto img_height = input_image_dims[2]; - - auto feature_width = input_dims[3]; - auto feature_height = input_dims[2]; - - auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] * - output_boxes->dims()[3]; - auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3]; - auto stride2 = output_boxes->dims()[3]; - - float step_width, step_height; - /// 300 / 19 - if (step_w == 0 || step_h == 0) { - step_width = static_cast(img_width) / feature_width; - step_height = static_cast(img_height) / feature_height; - } else { - step_width = step_w; - step_height = step_h; - } - - int num_priors = 0; - for (size_t i = 0; i < densities.size(); ++i) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - - auto box_dim = output_variances->dims(); - - output_boxes->Resize({feature_height, feature_width, num_priors, 4}); - int step_average = 
static_cast((step_width + step_height) * 0.5); - - std::vector sqrt_fixed_ratios; - for (size_t i = 0; i < fixed_ratios.size(); i++) { - sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i])); - } - - for (int h = 0; h < feature_height; ++h) { - for (int w = 0; w < feature_width; ++w) { - /// map origin image - float center_x = (w + offset) * step_width; - float center_y = (h + offset) * step_height; - int idx = 0; - for (size_t s = 0; s < fixed_sizes.size(); ++s) { - auto fixed_size = fixed_sizes[s]; - int density = densities[s]; - int shift = step_average / density; - // Generate density prior boxes with fixed ratios. - for (size_t r = 0; r < fixed_ratios.size(); ++r) { - float box_width_ratio = fixed_size * sqrt_fixed_ratios[r]; - float box_height_ratio = fixed_size / sqrt_fixed_ratios[r]; - float density_center_x = center_x - step_average / 2. + shift / 2.; - float density_center_y = center_y - step_average / 2. + shift / 2.; - for (int di = 0; di < density; ++di) { - for (int dj = 0; dj < density; ++dj) { - float center_x_temp = density_center_x + dj * shift; - float center_y_temp = density_center_y + di * shift; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = - std::max((center_x_temp - box_width_ratio / 2.) / img_width, - 0.); - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = - std::max((center_y_temp - box_height_ratio / 2.) / img_height, - 0.); - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = - std::min((center_x_temp + box_width_ratio / 2.) / img_width, - 1.); - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = - std::min((center_y_temp + box_height_ratio / 2.) 
/ img_height, - 1.); - idx++; - } - } - } - } - } - } - if (clip) { - math::Transform trans; - ClipFunctor clip_func; - trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(), - output_boxes_dataptr, clip_func); - } - - if ((variances.size() != 4)) { - LOG(kLOG_ERROR) << " variances.size() must be 4."; - } - - int64_t box_num = feature_height * feature_width * num_priors; - - for (int i = 0; i < box_num; i++) { - output_variances_dataptr[4 * i] = variances[0]; - output_variances_dataptr[4 * i + 1] = variances[1]; - output_variances_dataptr[4 * i + 2] = variances[2]; - output_variances_dataptr[4 * i + 3] = variances[3]; - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h b/mobile/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h deleted file mode 100644 index 1504850324..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DWCONVBNRELU_OP - -#pragma once -#include -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -void DWConvBNReluBasic(const FusionDWConvBNReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor new_bias = *param.NewBias(); - Tensor new_scale = *param.NewScale(); - - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / 
(output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::MatMulWithBn(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0), true, &new_scale, &new_bias, g); - } - } -} -template -void DWConvBNReluCompute(const FusionDWConvBNReluParam ¶m) { - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && - param.paddings_[0] == 1) { - math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - // 
math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), - // param.Output(), param.NewScale(), - // param.NewBias(), 1); - math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), true); - } else { - DWConvBNReluBasic(param); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/mobile/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h deleted file mode 100644 index 877ae712cf..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEADD_OP - -#pragma once - -#include "operators/math/element_wise.h" -#include "operators/op_param.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { - -template -inline void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { - const framework::Tensor *input_x = param.InputX(); - const framework::Tensor *input_y = param.InputY(); - framework::Tensor *output = param.Out(); - int axis = param.Axis(); - math::AddElememtWise(input_x, input_y, axis, output); -} - -template -struct AddElememtWiseStruct { - void operator()(const Tensor *X, const Tensor *Y, const int Axis, - Tensor *Out) {} -}; - -template -struct AddElememtWiseStruct { - void operator()(const Tensor *input, const Tensor *bias, const int Axis, - Tensor *output) { - const auto &x_dims = input->dims(); - const auto &y_dims = bias->dims(); - const int *input_data = input->data(); - const int *bias_data = bias->data(); - int *output_data = output->mutable_data(); - - if (x_dims == y_dims) { - size_t channels = 1; - size_t elementwise_num = 1; - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } -#pragma omp parallel for - for (int j = 0; j < channels; ++j) { - size_t offset = (0 * channels + j) * elementwise_num; - const int *input = input_data + offset; - const int bias = bias_data[j]; - int *output = output_data + offset; - for (int k = 0; k < elementwise_num; ++k) { - output[k] = math::Active(input[k] + bias); - } - } - } - } -}; - -template class ElementwiseAddKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h b/mobile/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h deleted file mode 100644 index 0aed7ff8d4..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle 
Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ELEMENTWISEMUL_OP - -#pragma once -#include "operators/math/elementwise_op_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -struct MulFunctor { - inline T operator()(T a, T b) const { return a * b; } -}; - -template -void ElementwiseMulCompute(const ElementwiseMulParam ¶m) { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - Tensor *Out = param.Out(); - Out->mutable_data(); - int axis = param.Axis(); - ElementwiseComputeEx, float>(input_x, input_y, axis, - MulFunctor(), Out); -} - -template class ElementwiseMulKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h b/mobile/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h deleted file mode 100644 index cb5bbc91c3..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ELEMENTWISESUB_OP - -#pragma once - -#include "framework/data_type.h" -#include "operators/math/elementwise_op_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -struct SubFunctor { - inline T operator()(T a, T b) const { return a - b; } -}; - -struct SubOpFunctor { - const framework::Tensor* x_; - const framework::Tensor* y_; - const int axis_; - framework::Tensor* out_; - - SubOpFunctor(const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* out, const int axis) - : x_(x), y_(y), out_(out), axis_(axis) {} - - template - void apply() const { - out_->mutable_data(); - ElementwiseComputeEx, T>(x_, y_, axis_, SubFunctor(), - out_); - } -}; - -template -void ElementwiseSubCompute(const ElementwiseSubParam& param) { - const Tensor* input_x = param.InputX(); - const Tensor* input_y = param.InputY(); - Tensor* out = param.Out(); - - int axis = param.Axis(); - framework::VisitDataType(framework::ToDataType(input_x->type()), - SubOpFunctor(input_x, input_y, out, axis)); -} - -template class ElementwiseSubKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/flatten_arm_func.h b/mobile/src/operators/kernel/central-arm-func/flatten_arm_func.h deleted file mode 100644 index 3966580133..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/flatten_arm_func.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FLATTEN_OP - -#ifndef RESHAPE_OP -#define RESHAPE_OP -#endif - -#pragma once - -#include -#include -#include "operators/flatten_op.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void FlattenCompute(const FlattenParam ¶m) { - const auto *input_x = param.InputX(); - const auto axis = param.Axis(); - const auto &input_x_dims = input_x->dims(); - auto *out = param.Out(); - - const auto &out_shape_v = GetOutputShape(axis, input_x_dims); - const framework::DDim &out_dim = ValidateShape(out_shape_v, input_x_dims); - - out->Resize(out_dim); - out->mutable_data(); - framework::TensorCopy(*input_x, out); - out->Resize(out_dim); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/mobile/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h deleted file mode 100644 index 9adc4a273a..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_FC_OP - -#pragma once - -#include -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionFcCompute(const FusionFcParam ¶m) { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - Tensor *input_z = param.InputZ(); - Otype *input_z_data = input_z->data(); - int axis = param.Axis(); - Tensor *out = param.Out(); - auto *out_data = out->mutable_data(); - int M = (int)input_x->dims()[0]; - - const Tensor x_matrix = - input_x->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) - : *input_x; - const Tensor y_matrix = - input_y->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) - : *input_y; - auto out_dim = out->dims(); - if (out_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); - PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1"); - PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], - " out_dim.size must be 2."); - axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); - PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. 
"); - - // bias_data的维度和out的第二个维度一致 - int64_t classes = input_z->numel(); - for (int i = 0; i < out_dim[0]; i++) { - memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes); - } - if (M == 1) { - math::MatMul(x_matrix, false, y_matrix, true, - static_cast(1), out, - static_cast(1), false); - } else { - math::MatMul(x_matrix, false, y_matrix, false, - static_cast(1), out, - static_cast(1), false); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/gru_arm_func.h b/mobile/src/operators/kernel/central-arm-func/gru_arm_func.h deleted file mode 100644 index 8975382732..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/gru_arm_func.h +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef GRU_OP -#pragma once - -#include -#include -#include "common/types.h" -#include "operators/math/gru_compute.h" -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -inline void ReorderInitState(const framework::Tensor& src, - std::vector index_lod, - framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims()); - row_shuffle(src, index_lod, dst, indexed_src); -} - -template -void GruCompute(const GruParam& param) { - auto* input = param.InputInput(); - auto* h0 = param.InputH0(); - auto* weight = param.InputWeight(); - const auto* weight_data = weight->data(); - auto* bias = param.InputBias(); - auto* batch_gate = param.OutBatchGate(); - batch_gate->mutable_data(); - auto* batch_reset_hidden_prev = param.OutBatchResetHiddenPrev(); - batch_reset_hidden_prev->mutable_data(); - auto* batch_hidden = param.OutBatchHidden(); - batch_hidden->mutable_data(); - auto* hidden = param.OutHidden(); - hidden->mutable_data(); - - auto hidden_dims = hidden->dims(); - - bool is_reverse = param.IsReverse(); - math::LoDTensor2BatchFunctor to_batch; - to_batch(*input, batch_gate, true, is_reverse); - if (bias) { - math::RowwiseAdd add_bias; - add_bias(*batch_gate, *bias, batch_gate); - } - int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; - gru_value.gate_weight = const_cast(weight_data); - gru_value.state_weight = - const_cast(weight_data + 2 * frame_size * frame_size); - framework::Tensor ordered_h0; - std::vector order(batch_gate->lod()[2]); - if (h0) { - // Since the batch computing for GRU reorders the input sequences - // according to their length. The initialized cell state also needs - // to reorder. 
- ReorderInitState(*h0, order, &ordered_h0, true); - gru_value.prev_out_value = ordered_h0.data(); - } else { - gru_value.prev_out_value = nullptr; - } - auto batch_starts = batch_gate->lod()[0]; - size_t seq_len = batch_starts.size() - 1; - auto active_node = math::GetActivationType(param.Activation()); - auto active_gate = math::GetActivationType(param.GateActivation()); - for (size_t n = 0; n < seq_len; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - int cur_batch_size = bend - bstart; - framework::Tensor gate_t = batch_gate->Slice(bstart, bend); - framework::Tensor reset_hidden_prev_t = - batch_reset_hidden_prev->Slice(bstart, bend); - framework::Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.data(); - gru_value.gate_value = gate_t.data(); - gru_value.reset_output_value = reset_hidden_prev_t.data(); - - math::GRUUnitFunctor::compute( - gru_value, frame_size, cur_batch_size, active_node, active_gate); - - gru_value.prev_out_value = gru_value.output_value; - } - math::Batch2LoDTensorFunctor to_seq; - batch_hidden->set_lod(batch_gate->lod()); - to_seq(*batch_hidden, hidden); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // GRU_OP diff --git a/mobile/src/operators/kernel/central-arm-func/gru_unit_arm_func.h b/mobile/src/operators/kernel/central-arm-func/gru_unit_arm_func.h deleted file mode 100644 index 568273e873..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/gru_unit_arm_func.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef GRU_UNIT_OP - -#pragma once - -#include -#include "operators/kernel/activation_kernel.h" -#include "operators/math/gemm.h" -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void GruUnitCompute(const GruUnitParam& param) { - // inputs - auto* input = param.InputInput(); - auto* hidden_prev = param.InputHiddenPrev(); - auto* weight = param.InputWeight(); - auto* bias = param.InputBias(); - // outputs - auto* gate = param.OutGate(); - gate->mutable_data

(); - auto* reset_hidden_prev = param.OutResetHiddenPrev(); - reset_hidden_prev->mutable_data

(); - auto* hidden = param.OutHidden(); - hidden->mutable_data

(); - - // add bias - if (bias) { - math::RowwiseAdd add_bias; - add_bias(*input, *bias, gate); - } - - int batch_size = input->dims()[0]; - int frame_size = hidden_prev->dims()[1]; - const P* weight_data = weight->data

(); - - math::GRUMetaValue

gru_value; - gru_value.gate_weight = const_cast(weight_data); - gru_value.state_weight = - const_cast(weight_data + 2 * frame_size * frame_size); - gru_value.prev_out_value = const_cast(hidden_prev->data

()); - - gru_value.output_value = hidden->data

(); - gru_value.gate_value = gate->data

(); - gru_value.reset_output_value = reset_hidden_prev->data

(); - - auto active_node = math::GetActivationType(param.Activation()); - auto active_gate = math::GetActivationType(param.GateActivation()); - math::GRUUnitFunctor::compute(gru_value, frame_size, batch_size, - active_node, active_gate); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/increment_arm_func.h b/mobile/src/operators/kernel/central-arm-func/increment_arm_func.h deleted file mode 100644 index 96473fef81..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/increment_arm_func.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef INCREMENT_OP - -#pragma once - -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void IncrementCompute(const IncrementParam ¶m) { - const framework::Tensor *input = param.InputX(); - framework::Tensor *out = param.Out(); - float step = param.Step(); - - out->mutable_data(); - const int64_t *input_data = input->data(); - int64_t *out_data = out->data(); - *out_data = *input_data + step; -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/lookup_arm_func.h b/mobile/src/operators/kernel/central-arm-func/lookup_arm_func.h deleted file mode 100644 index 917973822f..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/lookup_arm_func.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LOOKUP_OP -#pragma once - -#include -#include "framework/ddim.h" -#include "operators/op_param.h" - -constexpr int64_t kNoPadding = -1; - -namespace paddle_mobile { -namespace operators { - -template -void LookupCompute(const LookupParam ¶m) { - auto *ids_t = param.InputIds(); - auto *table_t = param.InputW(); - auto *output_t = param.Out(); - int64_t padding_idx = param.PaddingIdx(); - const framework::DDim &table_dim = table_t->dims(); - int64_t ids_numel; - const auto *ids = ids_t->data(); - ids_numel = ids_t->numel(); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - auto *table = table_t->data(); - auto *output = output_t->mutable_data(); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(float)); - } else { - PADDLE_MOBILE_ENFORCE(ids[i] < row_number, - "look uptable ids[i] = 0, - "lookuptable ids[i] >= 0 check failed"); - - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(float)); - } - } -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/lrn_arm_func.h b/mobile/src/operators/kernel/central-arm-func/lrn_arm_func.h deleted file mode 100644 index 165ad8dd8a..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/lrn_arm_func.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef LRN_OP - -#pragma once -#include "operators/op_param.h" -namespace paddle_mobile { -namespace operators { - -template -void LrnCompute(const LrnParam ¶m) { - const Tensor *input_x = param.InputX(); - auto x_dims = input_x->dims(); - Tensor *out = param.Out(); - out->mutable_data(); - /// data_format = NCHW - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; - - const int n = param.N(); - const float alpha = param.Alpha(); - const float beta = param.Beta(); - const float k = param.K(); - LRNFunctor lrnFunctor; - lrnFunctor(*input_x, out, N, C, H, W, n, k, alpha, beta); -} - -template class LrnKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/mul_arm_func.h b/mobile/src/operators/kernel/central-arm-func/mul_arm_func.h deleted file mode 100644 index 01d668021b..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/mul_arm_func.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MUL_OP - -#pragma once - -namespace paddle_mobile { -namespace operators { - -template -void MulCompute(const MulParam ¶m) { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - Tensor *out = param.Out(); - - const Tensor x_matrix = - input_x->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) - : *input_x; - const Tensor y_matrix = - input_y->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) - : *input_y; - auto out_dim = out->dims(); - if (out_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - if (param.InputX()->type() == type_id().hash_code()) { - out->mutable_data(); - math::MatMul(x_matrix, false, y_matrix, false, - static_cast(1), out, - static_cast(0)); - } else { - out->mutable_data(); - math::MatMul(x_matrix, false, y_matrix, false, - static_cast(1), out, - static_cast(0)); - } - if (out_dim.size() != 2) { - out->Resize(out_dim); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h b/mobile/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h deleted file mode 100644 index f44f348aa6..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h +++ /dev/null @@ -1,307 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MULTICLASSNMS_OP -#pragma once - -#include -#include -#include -#include -#include "framework/tensor.h" -#include "operators/math/poly_util.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -template -static inline void GetMaxScoreIndex( - const std::vector& scores, const T threshold, int top_k, - std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); - } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } -} - -template -static inline T BBoxArea(const T* box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. 
- return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const T* box1, const T* box2, - const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size, - const bool normalized) { - T bbox1_area = math::PolyArea(box1, box_size, normalized); - T bbox2_area = math::PolyArea(box2, box_size, normalized); - T inter_area = math::PolyOverlapArea(box1, box2, box_size, normalized); - if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { - // If coordinate values are is invalid - // if area size <= 0, return 0. - return static_cast(0.); - } else { - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline void NMSFast(const framework::Tensor& bbox, - const framework::Tensor& scores, - const T score_threshold, const T nms_threshold, - const T eta, const int64_t top_k, - std::vector* selected_indices) { - // The total boxes for each instance. 
- int64_t num_boxes = bbox.dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox.dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores.data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); - - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - const T* bbox_data = bbox.data(); - - while (sorted_indices.size() != 0) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (size_t k = 0; k < selected_indices->size(); ++k) { - if (keep) { - const int kept_idx = (*selected_indices)[k]; - T overlap = T(0.); - if (box_size == 4) { - overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); - } else { - overlap = PolyIoU(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, box_size, true); - } - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } -} - -template -void MultiClassNMS(const framework::Tensor& scores, - const framework::Tensor& bboxes, - std::map>* indices, int* num_nmsed_out, - const int& background_label, const int& nms_top_k, - const int& keep_top_k, const T& nms_threshold, - const T& nms_eta, const T& score_threshold) { - int64_t class_num = scores.dims()[0]; - int64_t predict_dim = scores.dims()[1]; - int num_det = 0; - for (int64_t c = 0; c < class_num; ++c) { - if (c == background_label) continue; - framework::Tensor score = scores.Slice(c, c + 1); - /// [c] is key - NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, - nms_top_k, &((*indices)[c])); - num_det += (*indices)[c].size(); - } - - *num_nmsed_out = num_det; - const T* scores_data = scores.data(); - if (keep_top_k > -1 && num_det > keep_top_k) { - std::vector>> 
score_index_pairs; - for (const auto& it : *indices) { - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& label_indices = it.second; - for (size_t j = 0; j < label_indices.size(); ++j) { - int idx = label_indices[j]; - // PADDLE_ENFORCE_LT(idx, predict_dim); - score_index_pairs.push_back( - std::make_pair(sdata[idx], std::make_pair(label, idx))); - } - } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); - score_index_pairs.resize(keep_top_k); - - // Store the new indices. - std::map> new_indices; - for (size_t j = 0; j < score_index_pairs.size(); ++j) { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].push_back(idx); - } - new_indices.swap(*indices); - *num_nmsed_out = keep_top_k; - } -} - -template -void MultiClassOutput(const framework::Tensor& scores, - const framework::Tensor& bboxes, - const std::map>& selected_indices, - framework::Tensor* outs) { - int predict_dim = scores.dims()[1]; - int box_size = bboxes.dims()[1]; - int out_dim = bboxes.dims()[1] + 2; - auto* scores_data = scores.data(); - auto* bboxes_data = bboxes.data(); - auto* odata = outs->data(); - - int count = 0; - for (const auto& it : selected_indices) { - /// one batch - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& indices = it.second; - for (size_t j = 0; j < indices.size(); ++j) { - int idx = indices[j]; - const T* bdata = bboxes_data + idx * box_size; - odata[count * out_dim] = label; // label - odata[count * out_dim + 1] = sdata[idx]; // score - // xmin, ymin, xmax, ymax - std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); - count++; - } - } -} - -template -void MultiClassNMSCompute(const MultiClassNMSParam& param) { - const auto* input_bboxes = param.InputBBoxes(); - const auto& input_bboxes_dims = input_bboxes->dims(); - 
- const auto* input_scores = param.InputScores(); - const auto& input_scores_dims = input_scores->dims(); - - auto* outs = param.Out(); - auto background_label = param.BackGroundLabel(); - auto nms_top_k = param.NMSTopK(); - auto keep_top_k = param.KeepTopK(); - auto nms_threshold = param.NMSThreshold(); - auto nms_eta = param.NMSEta(); - auto score_threshold = param.ScoreThreshold(); - - int64_t batch_size = input_scores_dims[0]; - int64_t class_num = input_scores_dims[1]; - int64_t predict_dim = input_scores_dims[2]; - int64_t box_dim = input_bboxes_dims[2]; - - std::vector>> all_indices; - std::vector batch_starts = {0}; - for (int64_t i = 0; i < batch_size; ++i) { - framework::Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - std::map> indices; - int num_nmsed_out = 0; - MultiClassNMS(ins_score, ins_boxes, &indices, &num_nmsed_out, - background_label, nms_top_k, keep_top_k, nms_threshold, - nms_eta, score_threshold); - all_indices.push_back(indices); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - } - - int num_kept = batch_starts.back(); - if (num_kept == 0) { - float* od = outs->mutable_data({1}); - od[0] = -1; - } else { - int64_t out_dim = box_dim + 2; - outs->mutable_data({num_kept, out_dim}); - for (int64_t i = 0; i < batch_size; ++i) { - framework::Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - int64_t s = batch_starts[i]; - int64_t e = batch_starts[i + 1]; - if (e > s) { - framework::Tensor out = outs->Slice(s, e); - MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); - } - } - } - - framework::LoD lod; - lod.emplace_back(batch_starts); - - outs->set_lod(lod); -} - -} // namespace operators -} // namespace 
paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/norm_arm_func.h b/mobile/src/operators/kernel/central-arm-func/norm_arm_func.h deleted file mode 100644 index 71b4c5515e..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/norm_arm_func.h +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef NORM_OP - -#pragma once - -#include -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -inline void GetDims(const framework::DDim &dim, int axis, int *pre, int *n, - int *post) { - *pre = 1; - *post = 1; - *n = dim[axis]; - for (int i = 0; i < axis; ++i) { - (*pre) *= dim[i]; - } - for (int i = axis + 1; i < dim.size(); ++i) { - (*post) *= dim[i]; - } -} - -template -void NormCompute(const NormParam ¶m) { - const float epsilon = param.Epsilon(); - int axis = param.Axis(); - - const framework::Tensor *input = param.InputX(); - framework::Tensor *norm = param.OutputNorm(); - framework::Tensor *out = param.Out(); - - auto x_dims = input->dims(); - if (axis < 0) { - axis += x_dims.size(); - } - - int pre, n, post; - GetDims(x_dims, axis, &pre, &n, &post); - - const float *input_ptr = input->data(); - float *norm_ptr = norm->mutable_data(); - float *out_ptr = out->mutable_data(); - - for (int p = 0; p < pre; ++p) { - const float *in_tmp = input_ptr + p * n * post; - float *norm_tmp = norm_ptr + p * post; - - // in_ch = 0; 
norm = epsilon + x * x - for (int i = 0; i < post; ++i) { - *norm_tmp = epsilon; - *norm_tmp += (*in_tmp) * (*in_tmp); - norm_tmp++; - in_tmp++; - } - - // in_ch >= 1; norm += x * x - for (int c = 1; c < n; ++c) { - norm_tmp = norm_ptr + p * post; - for (int i = 0; i < post; ++i) { - *norm_tmp += (*in_tmp) * (*in_tmp); - norm_tmp++; - in_tmp++; - } - } - - // norm = sqart(norm) - norm_tmp = norm_ptr + p * post; - for (int i = 0; i < post; ++i) { - *norm_tmp = sqrtf(*norm_tmp); - norm_tmp++; - } - - // out = input / norm - in_tmp = input_ptr + p * n * post; - float *out_tmp = out_ptr + p * n * post; - for (int c = 0; c < n; ++c) { - norm_tmp = norm_ptr + p * post; - for (int j = 0; j < post; ++j) { - *out_tmp = *in_tmp / *norm_tmp; - in_tmp++; - norm_tmp++; - out_tmp++; - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h b/mobile/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h deleted file mode 100644 index 9cbac1035f..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POLYGONBOXTRANSFORM_OP -#pragma once - -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void PolygonBoxTransformCompute(const PolygonBoxTransformParam& param) { - const auto* input = param.Input(); - const auto& input_dims = input->dims(); - const auto* input_data = input->data(); - auto* output = param.Output(); - auto* output_data = output->mutable_data(input_dims); - - int64_t batch_size = input_dims[0]; - int64_t geo_channel = input_dims[1]; - int64_t height = input_dims[2]; - int64_t width = input_dims[3]; - int64_t id = 0; - for (int64_t id_n = 0; id_n < batch_size * geo_channel; ++id_n) { - for (int64_t id_h = 0; id_h < height; ++id_h) { - for (int64_t id_w = 0; id_w < width; ++id_w) { - id = id_n * height * width + width * id_h + id_w; - if (id_n % 2 == 0) { - output_data[id] = id_w * 4 - input_data[id]; - } else { - output_data[id] = id_h * 4 - input_data[id]; - } - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/pool_arm_func.h b/mobile/src/operators/kernel/central-arm-func/pool_arm_func.h deleted file mode 100644 index 82c24d0ab4..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/pool_arm_func.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POOL_OP - -#pragma once - -#include -#include -#include "common/types.h" -#include "operators/math/pooling.h" - -namespace paddle_mobile { -namespace operators { - -template -void PoolCompute(const PoolParam ¶m) { - const framework::Tensor *input = param.Input(); - framework::Tensor *output = param.Output(); - const std::string &pooling_type = param.PoolingType(); - std::vector ksize = param.Ksize(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - const bool exclusive = param.isExclusive(); - if (param.isGlobalPooling()) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(input->dims()[i + 2]); - } - } - if (ksize[0] == 3 && ksize[0] == ksize[1]) { - if (pooling_type == "max" && strides[0] == strides[1]) { - if (strides[0] == 1) { - math::Pooling3x3()(*input, paddings, exclusive, output); - } else if (strides[0] == 2) { - math::Pooling3x3()(*input, paddings, exclusive, output); - } else { - math::Pooling()(*input, ksize, strides, paddings, output); - } - } else if (pooling_type == "avg" && strides[0] == strides[1]) { - if (strides[0] == 1) { - math::Pooling3x3()(*input, paddings, exclusive, output); - } else if (strides[0] == 2) { - math::Pooling3x3()(*input, paddings, exclusive, output); - } else { - math::Pooling()(*input, ksize, strides, paddings, output); - } - } - } else if (ksize[0] == 2 && ksize[0] == ksize[1]) { - if (pooling_type == "max" && strides[0] == strides[1]) { - if (strides[0] == 1) { - math::Pooling2x2()(*input, paddings, output); - } else if (strides[0] == 2) { - math::Pooling2x2()(*input, paddings, output); - } else { - math::Pooling()(*input, ksize, strides, paddings, output); - } - } else if (pooling_type == "avg" && strides[0] == strides[1]) { - if (strides[0] == 1) { - math::Pooling2x2()(*input, paddings, output); - } else if (strides[0] == 2) { - math::Pooling2x2()(*input, paddings, output); - } else { - math::Pooling()(*input, ksize, strides, 
paddings, output); - } - } - } else { - if (pooling_type == "max") { - math::Pooling()(*input, ksize, strides, paddings, output); - } else if (pooling_type == "avg") { - math::Pooling()(*input, ksize, strides, paddings, output); - } else { - // Others - } - } -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/prior_box_arm_func.h b/mobile/src/operators/kernel/central-arm-func/prior_box_arm_func.h deleted file mode 100644 index e783c52f81..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/prior_box_arm_func.h +++ /dev/null @@ -1,199 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PRIORBOX_OP -#pragma once - -#include -#include -#include - -namespace paddle_mobile { -namespace operators { - -template -struct ClipFunctor { - inline T operator()(T in) const { - return std::min(std::max(in, 0.), 1.); - } -}; - -template -void PriorBoxCompute(const PriorBoxParam ¶m) { - const auto *input_ = param.Input(); - const auto &input_dims = input_->dims(); - - const auto *input_image = param.InputImage(); - const auto &input_image_dims = input_image->dims(); - - const auto &min_sizes = param.MinSizes(); - const auto &max_sizes = param.MaxSizes(); - const auto &variances = param.Variances(); - const auto &input_aspect_ratio = param.AspectRatios(); - const bool &flip = param.Flip(); - const bool &clip = param.Clip(); - const float &step_w = param.StepW(); - const float &step_h = param.StepH(); - const float &offset = param.Offset(); - - Tensor *output_boxes = param.OutputBoxes(); - auto output_boxes_dataptr = output_boxes->mutable_data(); - Tensor *output_variances = param.OutputVariances(); - auto output_variances_dataptr = output_variances->mutable_data(); - - std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); - - auto img_width = input_image_dims[3]; - auto img_height = input_image_dims[2]; - - auto feature_width = input_dims[3]; - auto feature_height = input_dims[2]; - - auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] * - output_boxes->dims()[3]; - auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3]; - auto stride2 = output_boxes->dims()[3]; - - float step_width, step_height; - /// 300 / 19 - if (step_w == 0 || step_h == 0) { - step_width = static_cast(img_width) / feature_width; - step_height = static_cast(img_height) / feature_height; - } else { - step_width = step_w; - step_height = step_h; - } - - int num_priors = aspect_ratios.size() * min_sizes.size(); - if (!max_sizes.empty()) { - num_priors += max_sizes.size(); - } - - for (int h = 0; h < feature_height; ++h) { - 
for (int w = 0; w < feature_width; ++w) { - /// map origin image - float center_x = (w + offset) * step_width; - float center_y = (h + offset) * step_height; - float box_width, box_height; - int idx = 0; - for (size_t s = 0; s < min_sizes.size(); ++s) { - auto min_size = min_sizes[s]; - if (param.MinMaxAspectRatiosOrder()) { - box_width = box_height = min_size / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - - if (max_sizes.size() > 0) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - - // priors with different aspect ratios - for (float ar : aspect_ratios) { - if (fabs(ar - 1.) 
< 1e-6) { - continue; - } - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - /// box_width/2 , / img_width 为了得到feature map 相对于 - /// 原图的归一化位置的比例。 - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - - } else { - // priors with different aspect ratios - for (float ar : aspect_ratios) { - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - /// box_width/2 , / img_width 为了得到feature map 相对于 - /// 原图的归一化位置的比例。 - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - if (!max_sizes.empty()) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - } - } - } - } - if (clip) { - 
math::Transform trans; - ClipFunctor clip_func; - trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(), - output_boxes_dataptr, clip_func); - } - - if ((variances.size() != 4)) { - LOG(kLOG_ERROR) << " variances.size() must be 4."; - } - - int64_t box_num = feature_height * feature_width * num_priors; - - for (int i = 0; i < box_num; i++) { - output_variances_dataptr[4 * i] = variances[0]; - output_variances_dataptr[4 * i + 1] = variances[1]; - output_variances_dataptr[4 * i + 2] = variances[2]; - output_variances_dataptr[4 * i + 3] = variances[3]; - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/reshape2_arm_func.h b/mobile/src/operators/kernel/central-arm-func/reshape2_arm_func.h deleted file mode 100644 index c22cf12031..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/reshape2_arm_func.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE2_OP -#pragma once - -#include -#include "operators/kernel/reshape_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void Reshape2Compute(const Reshape2Param ¶m) { - const auto *input_x = param.InputX(); - const auto &input_x_dims = input_x->dims(); - auto *out = param.Out(); - framework::DDim out_dims = out->dims(); - const auto *input_shape = param.InputShape(); - - if (input_shape) { - auto *shape_data = input_shape->data(); - framework::Tensor cpu_shape_tensor; - auto shape = - std::vector(shape_data, shape_data + input_shape->numel()); - out_dims = ValidateShape(shape, input_x->dims()); - } else { - auto &shape = param.Shape(); - out_dims = ValidateShape(shape, input_x_dims); - } - - bool inplace = param.Inplace(); - out->Resize(out_dims); - if (!inplace) { - out->mutable_data(); - framework::TensorCopy(*input_x, out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*input_x); - out->Resize(out_dims); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/reshape_arm_func.h b/mobile/src/operators/kernel/central-arm-func/reshape_arm_func.h deleted file mode 100644 index 6e1a29dee6..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/reshape_arm_func.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE_OP -#pragma once - -#include -#include "operators/kernel/reshape_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void ReshapeCompute(const ReshapeParam ¶m) { - const auto *input_x = param.InputX(); - const auto &input_x_dims = input_x->dims(); - auto *out = param.Out(); - framework::DDim out_dims = out->dims(); - const auto *input_shape = param.InputShape(); - - if (input_shape) { - auto *shape_data = input_shape->data(); - framework::Tensor cpu_shape_tensor; - auto shape = - std::vector(shape_data, shape_data + input_shape->numel()); - out_dims = ValidateShape(shape, input_x->dims()); - } - - bool inplace = param.Inplace(); - out->Resize(out_dims); - if (!inplace) { - out->mutable_data(); - framework::TensorCopy(*input_x, out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*input_x); - out->Resize(out_dims); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/shape_arm_func.h b/mobile/src/operators/kernel/central-arm-func/shape_arm_func.h deleted file mode 100644 index fa9154211f..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/shape_arm_func.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SHAPE_OP -#pragma once - -#include -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void ShapeCompute(const ShapeParam& param) { - auto* in_t = param.Input(); - auto* out_t = param.Out(); - auto out_data = out_t->mutable_data(); - auto in_dims = in_t->dims(); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = static_cast(in_dims[i]); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/softmax_arm_func.h b/mobile/src/operators/kernel/central-arm-func/softmax_arm_func.h deleted file mode 100644 index 29d63937ba..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/softmax_arm_func.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SOFTMAX_OP -#pragma once -#include "../../math/softmax.h" -#include "operators/op_param.h" -namespace paddle_mobile { -namespace operators { - -void softmax_basic_axis_float(const float *din, float *dout, - const int axis_size, const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; -#pragma omp parallel for - for (int i = 0; i < compute_size; ++i) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template -void SoftmaxCompute(const SoftmaxParam ¶m) { - const Tensor *in_x = param.InputX(); - Tensor *out = param.Out(); - auto x_dims = in_x->dims(); - out->Resize(x_dims); - out->mutable_data(); - if (param.has_axis_) { - int axis = param.axis_; - int axis_size = x_dims[axis]; - auto x_rank = x_dims.size(); - DLOG << "x_rank :" << x_rank; - - if (axis < 0) { - axis += x_rank; - } - - DLOG << "axis :" << axis; - - int outer_num = framework::product(framework::slice_ddim(x_dims, 0, axis)); - DLOG << "outer_num :" << outer_num; - int inner_num = - framework::product(framework::slice_ddim(x_dims, axis + 1, x_rank)); - DLOG << "inner_num :" << inner_num; - - softmax_basic_axis_float(in_x->data(), out->data(), 
axis_size, - inner_num, outer_num); - } else { - math::SoftmaxFuntor()(in_x, out); - } -} -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/split_arm_func.h b/mobile/src/operators/kernel/central-arm-func/split_arm_func.h deleted file mode 100644 index 24ab2f83a4..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/split_arm_func.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SPLIT_OP -#pragma once - -#include -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -// Strided numel memory copy from src to dst by the specified axis -// -// For example, for a tensor dims [4, 20, 100], the strieded numel is -// [8000, 2000, 100] -// -// NOTE: The src and dst tensor should have the same elements -// except the specified axis. 
-template -inline void StridedNumelCopyWithAxis(int64_t axis, T* dst, - const framework::DDim& dst_stride_numel, - const T* src, - const framework::DDim& src_stride_numel, - int64_t size) { - int64_t before = dst_stride_numel[0] / dst_stride_numel[axis]; - int64_t src_after = src_stride_numel[axis]; - int64_t dst_after = dst_stride_numel[axis]; - - PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(), - "src and dst tensor should have the same dims size."); - - for (int64_t i = 0; i < axis; ++i) { - if (i < axis) { - PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] == - dst_stride_numel[i] / dst_stride_numel[axis], - "src and dst should have the same elements " - "except the specified axis."); - } else if (i == axis) { - continue; - } else { - PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i], - "src and dst should have the same elements " - "except the specified axis."); - } - } - - for (int64_t i = 0; i < before; ++i) { - memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size); - } -} - -template -void SplitCompute(const SplitParam& param) { - auto* in = param.InputX(); - auto outs = param.Outs(); - auto in_stride = framework::stride_numel(in->dims()); - int64_t axis = param.Axis(); - - size_t input_offset = 0; - for (auto& out : outs) { - out->mutable_data(); - auto out_stride = framework::stride_numel(out->dims()); - - StridedNumelCopyWithAxis(axis, out->data(), out_stride, - in->data() + input_offset, in_stride, - out_stride[axis]); - input_offset += out_stride[axis]; - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/sum_arm_func.h b/mobile/src/operators/kernel/central-arm-func/sum_arm_func.h deleted file mode 100644 index 7d41c898db..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/sum_arm_func.h +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SUM_OP -#pragma once - -#include -#include "operators/math/selected_rows_functor.h" - -namespace paddle_mobile { -namespace operators { - -using LoDTensorArray = std::vector; - -template -void SumCompute(const SumParam ¶m) { - auto inputsvars = param.InputsVars(); - int N = inputsvars.size(); - auto *outvar = param.OutVar(); - - bool in_place = outvar == inputsvars[0]; - if (outvar->IsType()) { - auto *out = outvar->GetMutable(); - if (!in_place) { - out->mutable_data(); - } - auto *outptr = out->data(); - // auto result = Flatten(*out); - - if (!in_place) { - std::fill(out->data(), out->data() + out->numel(), 0); - } - math::SelectedRowsAddToTensor functor; - for (int i = in_place ? 
1 : 0; i < N; i++) { - if (inputsvars[i]->IsType()) { - auto *in_t = inputsvars[i]->Get(); - auto *inptr = in_t->data(); - if (in_t->numel() == 0) { - continue; - } - for (int j = 0; j < out->numel(); ++j) { - outptr[j] = outptr[j] + inptr[j]; - } - - } else if (inputsvars[i]->IsType()) { - auto *in_t = inputsvars[i]->Get(); - functor(*in_t, out); - } else { - PADDLE_MOBILE_THROW_EXCEPTION( - "Variable type must be LoDTensor/SelectedRows."); - } - } - - } else if (outvar->IsType()) { - std::unique_ptr in0; - if (in_place) { - // If is in_place, we store the input[0] to in0 - auto *in_sel0 = inputsvars[0]->Get(); - auto &rows = in_sel0->rows(); - in0.reset(new framework::SelectedRows(rows, in_sel0->height())); - in0->mutable_value()->ShareDataWith(in_sel0->value()); - } - - auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & { - if (i == 0 && in0) { - return *in0.get(); - } else { - return *(inputsvars[i]->Get()); - } - }; - - auto *out = outvar->GetMutable(); - out->mutable_rows()->clear(); - auto *out_value = out->mutable_value(); - - // Runtime InferShape - size_t first_dim = 0; - for (int i = 0; i < N; i++) { - auto &sel_row = get_selected_row(i); - first_dim += sel_row.rows().size(); - } - auto in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); - in_dim[0] = static_cast(first_dim); - - out_value->Resize(framework::make_ddim(in_dim)); - - // if all the input sparse vars are empty, no need to - // merge these vars. 
- if (first_dim == 0UL) { - return; - } - out_value->mutable_data(); - math::SelectedRowsAddTo functor; - - int64_t offset = 0; - for (int i = 0; i < N; i++) { - auto &sel_row = get_selected_row(i); - if (sel_row.rows().size() == 0) { - continue; - } - PADDLE_MOBILE_ENFORCE(out->height() == sel_row.height(), - "seletrows height != outheight"); - functor(sel_row, offset, out); - offset += sel_row.value().numel(); - } - } else if (outvar->IsType()) { - auto &out_array = *outvar->GetMutable(); - for (size_t i = in_place ? 1 : 0; i < inputsvars.size(); ++i) { - PADDLE_MOBILE_ENFORCE(inputsvars[i]->IsType(), - "Only support all inputs are TensorArray"); - auto *in_array = inputsvars[i]->Get(); - - for (size_t i = 0; i < in_array->size(); ++i) { - if ((*in_array)[i].numel() != 0) { - if (i >= out_array.size()) { - out_array.resize(i + 1); - } - if (out_array[i].numel() == 0) { - framework::TensorCopy((*in_array)[i], &out_array[i]); - out_array[i].set_lod((*in_array)[i].lod()); - } else { - PADDLE_MOBILE_ENFORCE(out_array[i].lod() == (*in_array)[i].lod(), - "outLod != inLod"); - auto *inptr = (*in_array)[i].data(); - auto *outptr = out_array[i].data(); - - for (int j = 0; j < (*in_array)[i].numel(); ++j) { - outptr[j] = inptr[j] + outptr[j]; - } - } - } - } - } - } else { - PADDLE_MOBILE_THROW_EXCEPTION( - "Unexpected branch, output variable type is %d", outvar->Type()); - } -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/central-arm-func/transpose_arm_func.h b/mobile/src/operators/kernel/central-arm-func/transpose_arm_func.h deleted file mode 100644 index ef3d38eff2..0000000000 --- a/mobile/src/operators/kernel/central-arm-func/transpose_arm_func.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TRANSPOSE_OP -#pragma once - -#include -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void TransposeCompute(const TransposeParam& param) { - const auto* input_x = param.InputX(); - const auto input_x_dims = input_x->dims(); - auto* out = param.Out(); - const auto axis = param.Axis(); - const auto* input_x_data = input_x->data(); - auto* out_data = out->mutable_data(); - - size_t ndim = axis.size(); - std::vector xdim(ndim); - std::vector xstride(ndim); - std::vector xout(ndim); - for (int i = 0; i < ndim; i++) { - int j = ndim - 1 - i; - xdim[j] = input_x_dims[axis[i]]; - xstride[j] = 1; - for (int k = axis[i] + 1; k < ndim; k++) { - xstride[j] *= input_x_dims[k]; - } - xout[j] = xstride[j] * xdim[j]; - } - - auto numel = input_x->numel(); - size_t pind = 0; - std::vector ind(ndim); - for (int i = 0; i < numel; i++) { - out_data[i] = input_x_data[pind]; - ind[0]++; - pind += xstride[0]; - for (int j = 0; j < ndim - 1; j++) { - if (ind[j] == xdim[j]) { - ind[j + 1]++; - ind[j] = 0; - pind += xstride[j + 1]; - pind -= xout[j]; - } else { - break; - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/batchnorm_kernel.cpp b/mobile/src/operators/kernel/cl/batchnorm_kernel.cpp deleted file mode 100644 index 6e5039cf05..0000000000 --- a/mobile/src/operators/kernel/cl/batchnorm_kernel.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BATCHNORM_OP - -#include "operators/kernel/batchnorm_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool BatchNormKernel::Init(BatchNormParam *param) { - this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl"); - const framework::CLImage *mean = param->InputMean(); - const framework::CLImage *variance = param->InputVariance(); - const framework::CLImage *scale = param->InputScale(); - const framework::CLImage *bias = param->InputBias(); - const float epsilon = param->Epsilon(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - const int C = mean->numel(); - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - float *new_scale_ptr = new float[C]; - float *new_bias_ptr = new float[C]; - - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - } - - framework::CLImage *new_scale = new framework::CLImage(); - new_scale->SetTensorData(new_scale_ptr, variance->dims()); - new_scale->InitCLImage(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - - framework::CLImage *new_bias = new framework::CLImage(); - new_bias->SetTensorData(new_bias_ptr, variance->dims()); - 
new_bias->InitCLImage(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - delete[](new_scale_ptr); - delete[](new_bias_ptr); - - return true; -} - -template <> -void BatchNormKernel::Compute( - const BatchNormParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY()); - - auto input = param.InputX()->GetCLImage(); - auto out = param.OutputY()->GetCLImage(); - auto new_scale = param.NewScale()->GetCLImage(); - auto new_bias = param.NewBias()->GetCLImage(); - const int out_width = default_work_size[1]; - DLOG << *param.InputX(); - DLOG << *param.NewBias(); - DLOG << *param.NewScale(); - DLOG << default_work_size[0]; - DLOG << default_work_size[1]; - DLOG << default_work_size[2]; - DLOG << out_width; - DLOG << *param.OutputY(); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_int), &out_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &out); - CL_CHECK_ERRORS(status); - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -template class BatchNormKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/bilinear_interp_kernel.cpp b/mobile/src/operators/kernel/cl/bilinear_interp_kernel.cpp deleted file mode 100644 index 362cf5bb25..0000000000 --- a/mobile/src/operators/kernel/cl/bilinear_interp_kernel.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef BILINEAR_INTERP_OP - -#include - -namespace paddle_mobile { -namespace operators { -template <> -bool BilinearInterpKernel::Init( - paddle_mobile::operators::BilinearInterpParam - *param) { - this->cl_helper_.AddKernel("bilinear_interp", "bilinear_interp_kernel.cl"); - return true; -} - -template <> -void BilinearInterpKernel::Compute( - const paddle_mobile::operators::BilinearInterpParam - ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - auto input = param.InputX(); - cl_mem input_image = input->GetCLImage(); - auto output = param.Out(); - cl_mem output_image = output->GetCLImage(); - float scale_h, scale_w; - if (param.AlignCorners()) { - scale_h = (input->dims()[2] - 1.0f) / (output->dims()[2] - 1.0f); - scale_w = (input->dims()[3] - 1.0f) / (output->dims()[3] - 1.0f); - } else { - scale_h = input->dims()[2] / static_cast(output->dims()[2]); - scale_w = input->dims()[3] / static_cast(output->dims()[3]); - } - float align_delta = 0.0f; - if (!param.AlignCorners() && param.AlignMode() == 0) { - align_delta = 0.5f; - } - int in_dims_h = input->dims()[2]; - int out_dims_h = output->dims()[2]; - int in_dims_w = input->dims()[3]; - int out_dims_w = output->dims()[3]; - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - 
CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 2, sizeof(float), &scale_h); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 3, sizeof(float), &scale_w); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 4, sizeof(int), &in_dims_h); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 5, sizeof(int), &out_dims_h); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 6, sizeof(int), &in_dims_w); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 7, sizeof(int), &out_dims_w); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 8, sizeof(float), &align_delta); - CL_CHECK_ERRORS(status) - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status) -} -template class BilinearInterpKernel; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/box_coder_kernel.cpp b/mobile/src/operators/kernel/cl/box_coder_kernel.cpp deleted file mode 100644 index b98435f9b0..0000000000 --- a/mobile/src/operators/kernel/cl/box_coder_kernel.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef BOXCODER_OP - -#include "operators/kernel/box_coder_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool BoxCoderKernel::Init(BoxCoderParam* param) { - if (param->CodeType() == "decode_center_size") { - this->cl_helper_.AddKernel("box_decoder", "box_coder_kernel.cl"); - } - return true; -} - -template <> -void BoxCoderKernel::Compute( - const BoxCoderParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputBox()); - const auto* input_priorbox = param.InputPriorBox(); - const auto* input_priorboxvar = param.InputPriorBoxVar(); - const auto* input_targetbox = param.InputTargetBox(); - const auto& code_type = param.CodeType(); - if (code_type == "decode_center_size") { - auto prior_box_image = input_priorbox->GetCLImage(); - auto prior_box_var_image = input_priorboxvar->GetCLImage(); - auto target_box_image = input_targetbox->GetCLImage(); - auto output_image = param.OutputBox()->GetCLImage(); - auto& outputDim = param.OutputBox()->dims(); - int new_dims[4] = {1, 1, 1, 1}; - for (int i = 0; i < outputDim.size(); i++) { - new_dims[4 - outputDim.size() + i] = outputDim[i]; - } - int out_C = new_dims[1]; - int out_H = new_dims[2]; - DLOG << "out_C=" << out_C; - DLOG << "out_H=" << out_H; - DLOG << "default_work_size=" << default_work_size; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &prior_box_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &prior_box_var_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &target_box_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &out_H); - CL_CHECK_ERRORS(status); - size_t global_work_size[2] = 
{default_work_size[0], default_work_size[2]}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp deleted file mode 100644 index a4dfd8321e..0000000000 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp +++ /dev/null @@ -1,1140 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef CONV_OP - -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" -#include -#include "framework/cl/cl_image_converter.h" -#include "framework/cl/cl_tensor.h" - -namespace paddle_mobile { -namespace operators { -bool use_lws = true; -int preferred_lws = 0; -int preferred_lws_divisor = 2; - -template <> -void winograd_transform_weight<4, 3>(framework::CLHelper *cl_helper, - framework::CLImage *weight) {} - -template <> -void WinogradConv3x3<4, 3>(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu, - const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) {} - -void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu, - const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); - default_work_size[1] = (default_work_size[1] + 1) / 2; - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - int output_c = param.Output()->dims()[1]; - int filter_channel = param.Filter()->dims()[1]; - int input_channel = param.Input()->dims()[1]; - // - // DLOG << " c block " << c_block; - // DLOG << " w " << w; - // DLOG << " nh " << nh; - // DLOG << " stride " << stride; - // DLOG << " offset " << offset; - // DLOG << " input_c " << input_c; - // DLOG << " 
dilation " << dilation; - // DLOG << " input width " << input_width; - // DLOG << " input height " << input_height; - // DLOG << " output width " << output_width; - // DLOG << " output height " << output_height; - // DLOG << " input dim " << param.Input()->dims(); - // DLOG << " output dim " << param.Output()->dims(); - // DLOG << " filter dim " << param.Filter()->dims(); - - cl_int status; - int index = 0; - - status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 
index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) { - if (filter_channel != input_channel) { - if (filter_channel != 1) { - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel); - CL_CHECK_ERRORS(status); - int has_group = 1; - status = clSetKernelArg(kernel, index++, sizeof(int), &has_group); - CL_CHECK_ERRORS(status); - } - } else { - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel); - CL_CHECK_ERRORS(status); - int has_group = 0; - status = clSetKernelArg(kernel, index++, sizeof(int), &has_group); - CL_CHECK_ERRORS(status); - } - } - // DLOG<<"default_work_size"<KernelWorkSize(kernel); - auto tmp0 = default_work_size.data()[0]; - auto tmp1 = default_work_size.data()[1]; - auto tmp2 = default_work_size.data()[2]; - int max_work_size = static_cast(kernel_work_size); - if (preferred_lws_divisor > 1) { - max_work_size /= preferred_lws_divisor; - } - if (preferred_lws > 0 && preferred_lws <= max_work_size) { - max_work_size = preferred_lws; - } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; - } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; - } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? 
tmp0 / 2 : 1; - } - const size_t local_work_size[3] = {static_cast(tmp0), - static_cast(tmp1), - static_cast(tmp2)}; - if (max_work_size > 0 && use_lws) { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), local_work_size, 0, NULL, NULL); - } else { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - } - CL_CHECK_ERRORS(status); -} - -void ConvAddBnRelu(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu, - const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int input_c_origin = param.Input()->dims()[1]; - int dilation = param.Dilations()[0]; - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - int output_c = param.Output()->dims()[1]; - int filter_channel = param.Filter()->dims()[1]; - int input_channel = param.Input()->dims()[1]; - - // DLOG << " c block " << c_block; - // DLOG << " w " << w; - // DLOG << " nh " << nh; - // DLOG << " stride " << stride; - // DLOG << " offset " << offset; - // DLOG << " input_c " << input_c; - // DLOG << " dilation " << dilation; - // DLOG << " input width " << input_width; - // DLOG << " input height " << input_height; - // DLOG << " output 
width " << output_width; - // DLOG << " output height " << output_height; - // DLOG << " input dim " << param.Input()->dims(); - // DLOG << " output dim " << param.Output()->dims(); - // DLOG << " filter dim " << param.Filter()->dims(); - - cl_int status; - int index = 0; - - const int filter_height = param.Filter()->dims()[2]; - const int filter_width = param.Filter()->dims()[3]; - if (filter_height == 1 && filter_width == 1) { - status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - int maped_w = maptofactor(w, 4); - status = clSetKernelArg(kernel, index++, sizeof(int), &maped_w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_c_origin); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - 
CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - const size_t work_size[3] = { - static_cast(default_work_size.data()[0]), - static_cast(maped_w), - static_cast(default_work_size.data()[2])}; - - auto kernel_work_size = cl_helper->KernelWorkSize(kernel); - auto tmp0 = work_size[0]; - auto tmp1 = work_size[1]; - auto tmp2 = work_size[2]; - int max_work_size = static_cast(kernel_work_size); - if (preferred_lws_divisor > 1) { - max_work_size /= preferred_lws_divisor; - } - if (preferred_lws > 0 && preferred_lws <= max_work_size) { - max_work_size = preferred_lws; - } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; - } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; - } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? 
tmp0 / 2 : 1; - } - const size_t local_work_size[3] = {static_cast(tmp0), - static_cast(tmp1), - static_cast(tmp2)}; - if (max_work_size > 0 && use_lws) { - status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, - default_work_size.size(), NULL, work_size, - local_work_size, 0, NULL, NULL); - } else { - status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, - default_work_size.size(), NULL, work_size, - NULL, 0, NULL, NULL); - } - CL_CHECK_ERRORS(status); - } else { - status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, 
sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - if (filter_height == 3 && filter_width == 3) { - // normal conv - if (param.Filter()->dims()[0] == param.Output()->dims()[1] && - param.Filter()->dims()[1] == param.Input()->dims()[1]) { - status = clSetKernelArg(kernel, index++, sizeof(int), &output_c); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel); - CL_CHECK_ERRORS(status); - int group = 1; - status = clSetKernelArg(kernel, index++, sizeof(int), &group); - CL_CHECK_ERRORS(status); - } else if (!(param.Filter()->dims()[0] == param.Input()->dims()[1] && - param.Filter()->dims()[1] == 1)) { // not depwise - status = clSetKernelArg(kernel, index++, sizeof(int), &output_c); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel); - CL_CHECK_ERRORS(status); - int group = input_channel / filter_channel; - status = clSetKernelArg(kernel, index++, sizeof(int), &group); - CL_CHECK_ERRORS(status); - } - } else if (filter_height != 3 && filter_width != 3) { - // not 3x3 - if (param.Filter()->dims()[1] == 1 && - param.Input()->dims()[1] == param.Output()->dims()[1]) { - // deepwise basic use in not 3x3 - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height); - CL_CHECK_ERRORS(status); - } - } - - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } -} - -void DWConvAddBnRelu(framework::CLHelper *cl_helper, - const ConvParam ¶m, 
bool ifRelu, - const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - default_work_size[1] = w_blk; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int pad = param.Paddings()[0]; - int dilation = param.Dilations()[0]; - - int input_channel = param.Input()->dims()[1]; - int input_height = param.Input()->dims()[2]; - int input_width = param.Input()->dims()[3]; - - int output_height = param.Output()->dims()[2]; - int output_width = param.Output()->dims()[3]; - - // DLOG << " w " << w; - // DLOG << " nh " << nh; - // DLOG << " stride " << stride; - // DLOG << " dilation " << dilation; - // DLOG << " input width " << input_width; - // DLOG << " input height " << input_height; - // DLOG << " output width " << output_width; - // DLOG << " output height " << output_height; - // DLOG << " input dim " << param.Input()->dims(); - // DLOG << " output dim " << param.Output()->dims(); - // DLOG << " filter dim " << param.Filter()->dims(); - - cl_int status; - int index = 0; - - status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = 
clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &pad); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - auto kernel_work_size = cl_helper->KernelWorkSize(kernel); - auto tmp0 = default_work_size.data()[0]; - auto tmp1 = default_work_size.data()[1]; - auto tmp2 = default_work_size.data()[2]; - int max_work_size = static_cast(kernel_work_size); - if (preferred_lws_divisor > 1) { - max_work_size /= preferred_lws_divisor; - } - if (preferred_lws > 0 && preferred_lws <= max_work_size) { - max_work_size = preferred_lws; - } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; - } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? 
tmp2 / 2 : 1; - } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; - } - const size_t local_work_size[3] = {static_cast(tmp0), - static_cast(tmp1), - static_cast(tmp2)}; - if (max_work_size > 0 && use_lws) { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), local_work_size, 0, NULL, NULL); - } else { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - } - - CL_CHECK_ERRORS(status); -} - -void SWConvAddBnRelu(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu, - const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - int w_blk_size = 5; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - default_work_size[1] = w_blk; - - int h_blk_size = 1; - int h_blk = (nh + h_blk_size - 1) / h_blk_size; - default_work_size[2] = h_blk; - - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int pad = param.Paddings()[0]; - int dilation = param.Dilations()[0]; - - int input_channel = param.Input()->dims()[1]; - int input_height = param.Input()->dims()[2]; - int input_width = param.Input()->dims()[3]; - int output_height = param.Output()->dims()[2]; - int output_width = param.Output()->dims()[3]; - - cl_int status; - int index = 0; - - status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk); - CL_CHECK_ERRORS(status); - - status = 
clSetKernelArg(kernel, index++, sizeof(int), &h_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &pad); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - auto kernel_work_size = cl_helper->KernelWorkSize(kernel); - auto tmp0 = default_work_size.data()[0]; - auto tmp1 = default_work_size.data()[1]; - auto tmp2 = default_work_size.data()[2]; - int max_work_size = static_cast(kernel_work_size); - if (preferred_lws_divisor > 1) { - max_work_size /= preferred_lws_divisor; - } - if 
(preferred_lws > 0 && preferred_lws <= max_work_size) { - max_work_size = preferred_lws; - } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; - } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; - } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; - } - const size_t local_work_size[3] = {static_cast(tmp0), - static_cast(tmp1), - static_cast(tmp2)}; - if (max_work_size > 0 && use_lws) { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), local_work_size, 0, NULL, NULL); - } else { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - } - CL_CHECK_ERRORS(status); -} - -void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu, const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - int w_blk_size = 1; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - default_work_size[1] = w_blk; - - int h_blk_size = 1; - int h_blk = (nh + h_blk_size - 1) / h_blk_size; - default_work_size[2] = h_blk; - - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int pad = param.Paddings()[0]; - int dilation = param.Dilations()[0]; - - int input_channel = param.Input()->dims()[1]; - int input_height = param.Input()->dims()[2]; - int input_width = param.Input()->dims()[3]; - - int output_height = param.Output()->dims()[2]; 
- int output_width = param.Output()->dims()[3]; - - int filter_height = param.Filter()->dims()[2]; - int filter_width = param.Filter()->dims()[3]; - - cl_int status; - int index = 0; - - status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &pad); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), 
&output_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height); - CL_CHECK_ERRORS(status); - - if (default_work_size.data()[1] % 60 == 0 && use_lws) { - const size_t local_work_size[3] = {static_cast(1), - static_cast(60), - static_cast(1)}; - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), local_work_size, 0, NULL, NULL); - } else { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - } - CL_CHECK_ERRORS(status); -} - -void ConvTransposeAddBnRelu_b(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu, const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - const auto *input = param.Input(); - auto *output = param.Output(); - auto *filter = param.Filter(); - const int n = input->dims()[0]; - const int input_c = input->dims()[1]; - const int input_c_block = (input_c + 3) / 4; - const int input_width = input->dims()[3]; - const int input_height = input->dims()[2]; - const int output_c = output->dims()[1]; - const int output_c_block = (output_c + 3) / 4; - const int output_width = output->dims()[3]; - const int output_height = output->dims()[2]; - - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - auto filterImage = filter->GetCLImage(); - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(int), &input_c_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, 
sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &filterImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - - const size_t work_size[3] = {(size_t)output_c_block, (size_t)input_width, - (size_t)(n * input_height)}; - - DLOG << "conv transpose " << input_c_block << input_width << input_height - << output_width << output_height << work_size[0] << work_size[1] - << work_size[2]; - - clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL, - work_size, NULL, 0, NULL, NULL); -} -void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu, const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - int w_blk_size = 1; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - default_work_size[1] = w_blk; - - int h_blk_size = 1; - int h_blk = (nh + h_blk_size - 1) / h_blk_size; - default_work_size[2] = h_blk; - - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int pad = param.Paddings()[0]; - int dilation = param.Dilations()[0]; - - int input_channel = param.Input()->dims()[1]; - int input_height = param.Input()->dims()[2]; - int input_width = param.Input()->dims()[3]; - - int output_height = param.Output()->dims()[2]; - int output_width = param.Output()->dims()[3]; - - int filter_height = 
param.Filter()->dims()[2]; - int filter_width = param.Filter()->dims()[3]; - - cl_int status; - int index = 0; - - status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &pad); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = 
clSetKernelArg(kernel, index++, sizeof(int), &filter_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height); - CL_CHECK_ERRORS(status); - - if (default_work_size.data()[1] % 60 == 0 && use_lws) { - const size_t local_work_size[3] = {static_cast(1), - static_cast(60), - static_cast(1)}; - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), local_work_size, 0, NULL, NULL); - } else { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - } - CL_CHECK_ERRORS(status); -} -void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu, const framework::CLImage *biase, - const framework::CLImage *new_scale, - const framework::CLImage *new_bias) { - auto kernel = cl_helper->KernelAt(0); - auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - int w_blk_size = 5; - int w_blk = (w + w_blk_size - 1 + 5) / w_blk_size / 2 * 2; - default_work_size[1] = w_blk; - - int h_blk_size = 1; - int h_blk = (nh + h_blk_size - 1) / h_blk_size; - default_work_size[2] = h_blk; - - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int pad = param.Paddings()[0]; - int dilation = param.Dilations()[0]; - - int input_channel = param.Input()->dims()[1]; - int input_height = param.Input()->dims()[2]; - int input_width = param.Input()->dims()[3]; - - int output_height = param.Output()->dims()[2]; - int output_width = param.Output()->dims()[3]; - - int filter_height = param.Filter()->dims()[2]; - int filter_width = param.Filter()->dims()[3]; - - cl_int status; - int index = 0; - - 
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - if (biase) { - auto bias_mem = biase->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); - CL_CHECK_ERRORS(status); - } - - if (new_scale && new_bias) { - auto new_scale_mem = new_scale->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); - CL_CHECK_ERRORS(status); - - auto new_bias_mem = new_bias->GetCLImage(); - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); - CL_CHECK_ERRORS(status); - } - - status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &pad); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 
index++, sizeof(int), &filter_height); - CL_CHECK_ERRORS(status); - - auto kernel_work_size = cl_helper->KernelWorkSize(kernel); - auto tmp0 = default_work_size.data()[0]; - auto tmp1 = default_work_size.data()[1]; - auto tmp2 = default_work_size.data()[2]; - int max_work_size = static_cast(kernel_work_size); - if (preferred_lws_divisor > 1) { - max_work_size /= preferred_lws_divisor; - } - if (preferred_lws > 0 && preferred_lws <= max_work_size) { - max_work_size = preferred_lws; - } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; - } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; - } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; - } - const size_t local_work_size[3] = {static_cast(tmp0), - static_cast(tmp1), - static_cast(tmp2)}; - if (max_work_size > 0 && use_lws) { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), local_work_size, 0, NULL, NULL); - } else { - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - } - CL_CHECK_ERRORS(status); -} -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h deleted file mode 100644 index a2488aaa2d..0000000000 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#if defined(CONV_OP) || defined(CONV_TRANSPOSE_OP) - -#pragma once - -#include "framework/cl/cl_helper.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -inline int maptofactor(int i, int factor) { return (i + factor - 1) / factor; } - -template -void winograd_transform_weight(framework::CLHelper *cl_helper, - framework::CLImage *weight); - -template -void WinogradConv3x3(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); - -void ConvAddBnRelu(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); - -void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); - -void DWConvAddBnRelu(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); - -void SWConvAddBnRelu(framework::CLHelper *cl_helper, - const ConvParam ¶m, bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = 
nullptr); -void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); -void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); -void ConvTransposeAddBnRelu_b(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); -void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper, - const ConvTransposeParam ¶m, - bool ifRelu = false, - const framework::CLImage *biase = nullptr, - const framework::CLImage *new_scale = nullptr, - const framework::CLImage *new_bias = nullptr); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.cpp deleted file mode 100644 index 1f25d3436e..0000000000 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef INSTANCENORM_OP -#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h" -#include -namespace paddle_mobile { -namespace operators { -void InstanceNorm(framework::CLHelper *cl_helper, - const framework::CLImage *input, framework::CLImage *output, - float epsilon) { - auto kernel = cl_helper->KernelAt(0); - - auto &dims = output->dims(); - const int n = dims[0]; - const int c_group = (dims[1] + 3) / 4; - const int h = dims[2]; - const int w = dims[3]; - auto input_image = input->GetCLImage(); - auto out_image = output->GetCLImage(); - - // DLOG << "Epsilon: " << epsilon; - - auto local_work_size_info = cl_helper->LocalWorkSizeInfo(); - // - // DLOG << local_work_size_info.max_work_group_size; - // DLOG << local_work_size_info.max_work_item_size0; - // DLOG << local_work_size_info.max_work_item_size1; - // DLOG << local_work_size_info.max_work_item_size2; - int maxTotal = - std::min(static_cast(local_work_size_info.max_work_group_size), 256); - int local_work_size1 = - std::min(static_cast(local_work_size_info.max_work_item_size1), - std::min(256, w)); - int local_work_size2 = 1; - const size_t work_size[3] = {(size_t)(n * c_group), (size_t)local_work_size1, - (size_t)local_work_size2}; - const size_t local_work_size[3] = {(size_t)1, (size_t)local_work_size1, - (size_t)local_work_size2}; - - // DLOG << "work_size" << work_size[0] << " " << work_size[1] << " " - // << work_size[2]; - // DLOG << "local_work_size" << local_work_size[0] << " " << - // local_work_size[1] - // << " " << local_work_size[2]; - cl_int status; - clSetKernelArg(kernel, 0, sizeof(cl_int), &w); - CL_CHECK_ERRORS(status); - clSetKernelArg(kernel, 1, sizeof(cl_int), &h); - CL_CHECK_ERRORS(status); - clSetKernelArg(kernel, 2, sizeof(cl_int), &c_group); - CL_CHECK_ERRORS(status); - clSetKernelArg(kernel, 3, sizeof(cl_int), &local_work_size1); - CL_CHECK_ERRORS(status); - 
clSetKernelArg(kernel, 4, sizeof(cl_int), &local_work_size2); - CL_CHECK_ERRORS(status); - clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon); - CL_CHECK_ERRORS(status); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - clSetKernelArg(kernel, 7, sizeof(cl_mem), &out_image); - CL_CHECK_ERRORS(status); - clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL, - work_size, local_work_size, 0, NULL, NULL); -} -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.h b/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.h deleted file mode 100644 index 1e46ebf4ba..0000000000 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/instancenorm_func.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined(INSTANCENORM_OP) || defined(FUSION_INSTANCENORM_RELU_OP) - -#pragma once - -#include "framework/cl/cl_helper.h" -#include "operators/op_param.h" -namespace paddle_mobile { -namespace operators { -void InstanceNorm(framework::CLHelper *cl_helper, - const framework::CLImage *input, framework::CLImage *output, - float epsilon); -} -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl deleted file mode 100644 index 9d0857a45e..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void batchnorm(__private const int out_width, - __read_only image2d_t input, - __read_only image2d_t new_scale_image, - __read_only image2d_t new_bias_image, - __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - half4 new_scale = read_imageh(new_scale_image, sampler, (int2)(out_c, 0)); - half4 new_bias = read_imageh(new_bias_image, sampler, (int2)(out_c, 0)); - - int pos_x = mad24(out_c, out_width, out_w); - half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh)); - half4 out = mad(in, new_scale, new_bias); - - write_imageh(output, (int2)(pos_x, out_nh), out); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl deleted file mode 100644 index fa504a6ed1..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/bilinear_interp_kernel.cl +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void bilinear_interp( - __read_only image2d_t input, __write_only image2d_t output, - __private const float scale_h, __private const float scale_w, - __private const int in_dims_h, __private const int out_dims_h, - __private const int in_dims_w, __private const int out_dims_w, - __private const float align_delta) { - const int c = get_global_id(0); - const int w = get_global_id(1); - const int nh = get_global_id(2); - - int2 output_pos; - output_pos.x = c * out_dims_w + w; - output_pos.y = nh; - - // calculate center pixel's pos - int out_n = nh / out_dims_h; - int out_h = nh % out_dims_h; - float center_w = (w + align_delta) * scale_w - align_delta; - float center_h = (out_h + align_delta) * scale_h - align_delta; - - int floor_w = (int)center_w; - int floor_h = (int)center_h; - int ceil_w = floor_w + 1; - int ceil_h = floor_h + 1; - - if (ceil_w > in_dims_w) { - ceil_w = floor_w; - } - if (ceil_h > in_dims_h) { - ceil_h = floor_h; - } - float wight0_w = center_w - floor_w; - float wight0_h = center_h - floor_h; - float wight1_w = 1.0f - wight0_w; - float wight1_h = 1.0f - wight0_h; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - // get left up pixel data - int2 left_up; - left_up.x = c * in_dims_w + floor_w; - left_up.y = out_n * in_dims_h + ceil_h; - half4 left_up_data = read_imageh(input, sampler, left_up); - - // get left down pixel data - int2 left_down; - left_down.x = c * in_dims_w + floor_w; - left_down.y = out_n * in_dims_h + floor_h; - half4 left_down_data = read_imageh(input, sampler, left_down); - - // get right up pixel data - int2 right_up; - right_up.x = c * in_dims_w + ceil_w; - right_up.y = out_n * in_dims_h + ceil_h; - half4 right_up_data = read_imageh(input, sampler, right_up); - - // get right down pixel's data - int2 right_down; - right_down.x = c * in_dims_w + ceil_w; - right_down.y = out_n * in_dims_h + floor_h; - half4 
right_down_data = read_imageh(input, sampler, right_down); - - // calculate output data - half4 data = - (left_down_data * (half)wight1_w + right_down_data * (half)wight0_w) * - (half)wight1_h + - (left_up_data * (half)wight1_w + right_up_data * (half)wight0_w) * - (half)wight0_h; - - write_imageh(output, output_pos, data); -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl deleted file mode 100644 index 60000c994e..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void box_decoder(__read_only image2d_t prior_box_image, - __read_only image2d_t prior_box_var_image, - __read_only image2d_t target_box_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H - ){ - const int out_c = get_global_id(0); - const int out_nh = get_global_id(1); - const int out_h = out_nh%out_H; - const int out_n = 1; - - const int prior_box_n = 1; - const int prior_box_c = 0; - const int prior_box_h = out_h; - - - const int prior_box_var_n = 1; - const int prior_box_var_c = 0; - const int prior_box_var_h = out_h; - - const int target_box_n = 1; - const int target_box_c = out_c; - const int target_box_h = out_h; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - int2 prior_box_pos; - int2 prior_box_var_pos; - int2 target_box_pos; - int2 output_pos; - - prior_box_pos.x = prior_box_c * 4; - prior_box_pos.y = prior_box_n * prior_box_h; - - prior_box_var_pos.x = prior_box_var_c * 4; - prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; - - target_box_pos.x = target_box_c * 4; - target_box_pos.y = target_box_n * target_box_h; - - output_pos.x = out_c * 4; - output_pos.y = out_n * out_h; - - half4 prior_box_input[4]; - half4 prior_box_var_input[4]; - half4 target_box_input[4]; - - prior_box_input[0] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 0,prior_box_pos.y)); - prior_box_input[1] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 1,prior_box_pos.y)); - prior_box_input[2] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 2,prior_box_pos.y)); - prior_box_input[3] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 3,prior_box_pos.y)); - - prior_box_var_input[0] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 0,prior_box_var_pos.y)); - prior_box_var_input[1] = read_imageh(prior_box_var_image, 
sampler,(int2)(prior_box_var_pos.x + 1,prior_box_var_pos.y)); - prior_box_var_input[2] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 2,prior_box_var_pos.y)); - prior_box_var_input[3] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 3,prior_box_var_pos.y)); - - - - target_box_input[0] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 0,target_box_pos.y)); - target_box_input[1] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 1,target_box_pos.y)); - target_box_input[2] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 2,target_box_pos.y)); - target_box_input[3] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 3,target_box_pos.y)); - - half prior_box_width = prior_box_input[2].x - prior_box_input[0].x; - half prior_box_height = prior_box_input[3].x - prior_box_input[1].x; - half prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(half)2; - half prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(half)2; - - half4 target_box_center_x; - half4 target_box_center_y; - half4 target_box_width; - half4 target_box_height; - half4 output[4]; - - output[0] = 0.0f; - output[1] = 0.0f; - output[2] = 0.0f; - output[3] = 0.0f; - - target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; - target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; - target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width; - target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; - - output[0].x = target_box_center_x.x - target_box_width.x/(half)2; - output[1].x = target_box_center_y.x - target_box_height.x/(half)2; - output[2].x = target_box_center_x.x + target_box_width.x/(half)2; - output[3].x = target_box_center_y.x + target_box_height.x/(half)2; - - if(out_C - out_c 
* 4 >= 2){ - target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; - target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; - target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; - target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; - output[0].y = target_box_center_x.y - target_box_width.y/(half)2; - output[1].y = target_box_center_y.y - target_box_height.y/(half)2; - output[2].y = target_box_center_x.y + target_box_width.y/(half)2; - output[3].y = target_box_center_y.y + target_box_height.y/(half)2; - - } - if(out_C - out_c * 4 >= 3){ - target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; - target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; - target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; - target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; - output[0].z = target_box_center_x.z - target_box_width.z/(half)2; - output[1].z = target_box_center_y.z - target_box_height.z/(half)2; - output[2].z = target_box_center_x.z + target_box_width.z/(half)2; - output[3].z = target_box_center_y.z + target_box_height.z/(half)2; - } - if(out_C - out_c * 4 >= 4){ - target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; - target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; - target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width; - target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; - output[0].w = target_box_center_x.w - target_box_width.w/(half)2; - output[1].w = 
target_box_center_y.w - target_box_height.w/(half)2; - output[2].w = target_box_center_x.w + target_box_width.w/(half)2; - output[3].w = target_box_center_y.w + target_box_height.w/(half)2; - } - - - write_imageh(output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); - write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); - write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); - write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); - -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl deleted file mode 100644 index 964cc7e75d..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - int2 coords_bias; - coords_bias.x = x/w; - coords_bias.y = 0; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords_bias); - half4 output = in + biase; - write_imageh(outputImage,coords,output); - } - -__kernel void width_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t -outputImage,int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - int2 coords_bias; - coords_bias.x = x % w; - coords_bias.y = 0; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords_bias); - half4 output; - output.x = in.x + biase.x; - output.y = in.y + biase.x; - output.z = in.z + biase.x; - output.w = in.w + biase.x; - write_imageh(outputImage,coords,output); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/cl_common.h b/mobile/src/operators/kernel/cl/cl_kernel/cl_common.h deleted file mode 100644 index 34f36eb9a3..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/cl_common.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -inline half4 activation(half4 in -#ifdef PRELU - , - half4 prelu_alpha -#endif -) { - half4 output; -#ifdef PRELU - output = select(prelu_alpha * in, in, in >= (half4)0.0); -#endif - -#ifdef RELU - output = fmax(in, (half4)(0.0f)); -#endif - return output; -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/concat_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/concat_kernel.cl deleted file mode 100644 index c636bf5fd4..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/concat_kernel.cl +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - - -__kernel void concatByCWith2Inputs(__read_only image2d_t input_image_0, - __read_only image2d_t input_image_1, - __private const int C_0, - __private const int C_1, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_W) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - half4 output_data; - - for (int i = 0; i < 4; i++) { - int c = out_c * 4 + i; - if (c >= out_C) { - break; - } - int c_in; - half4 input_data; - if (c < C_0) { - c_in = c; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_0, sampler, input_pos); - } else { - c_in = c - C_0; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_1, sampler, input_pos); - } - int value_offset = c_in % 4; - float value; - if (value_offset == 0) { - value = input_data.x; - } else if (value_offset == 1) { - value = input_data.y; - } else if (value_offset == 2) { - value = input_data.z; - } else if (value_offset == 3) { - value = input_data.w; - } - if (i == 0) { - output_data.x = value; - } else if (i == 1) { - output_data.y = value; - } else if (i == 2) { - output_data.z = value; - } else if (i == 3) { - output_data.w = value; - } - } - write_imageh(output_image, output_pos, output_data); -} - -__kernel void concatByCWith3Inputs(__read_only image2d_t input_image_0, - __read_only image2d_t input_image_1, - __read_only image2d_t input_image_2, - __private const int C_0, - __private const int C_1, - __private const int C_2, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_W) { - const 
int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - half4 output_data; - - for (int i = 0; i < 4; i++) { - int c = out_c * 4 + i; - if (c >= out_C) { - break; - } - int c_in; - half4 input_data; - if (c < C_0) { - c_in = c; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_0, sampler, input_pos); - } else if (c < C_0 + C_1) { - c_in = c - C_0; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_1, sampler, input_pos); - } else { - c_in = c - C_0 - C_1; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_2, sampler, input_pos); - } - int value_offset = c_in % 4; - float value; - if (value_offset == 0) { - value = input_data.x; - } else if (value_offset == 1) { - value = input_data.y; - } else if (value_offset == 2) { - value = input_data.z; - } else if (value_offset == 3) { - value = input_data.w; - } - if (i == 0) { - output_data.x = value; - } else if (i == 1) { - output_data.y = value; - } else if (i == 2) { - output_data.z = value; - } else if (i == 3) { - output_data.w = value; - } - } - write_imageh(output_image, output_pos, output_data); -} - - -__kernel void concatByCWith4Inputs(__read_only image2d_t input_image_0, - __read_only image2d_t input_image_1, - __read_only image2d_t input_image_2, - __read_only image2d_t input_image_3, - __private const int C_0, - __private const int C_1, - __private const int C_2, - __private const int C_3, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_W) { - const int out_c = get_global_id(0); - const int out_w = 
get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - half4 output_data; - - for (int i = 0; i < 4; i++) { - int c = out_c * 4 + i; - if (c >= out_C) { - break; - } - int c_in; - half4 input_data; - if (c < C_0) { - c_in = c; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_0, sampler, input_pos); - } else if (c < C_0 + C_1) { - c_in = c - C_0; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_1, sampler, input_pos); - } else if (c < C_0 + C_1 + C_2) { - c_in = c - C_0 - C_1; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_2, sampler, input_pos); - }else if (c < C_0 + C_1 + C_2 + C_3){ - c_in = c - C_0 - C_1 - C_2; - int2 input_pos; - input_pos.x = (c_in / 4) * out_W + out_w; - input_pos.y = out_nh; - input_data = read_imageh(input_image_3, sampler, input_pos); - } - int value_offset = c_in % 4; - float value; - if (value_offset == 0) { - value = input_data.x; - } else if (value_offset == 1) { - value = input_data.y; - } else if (value_offset == 2) { - value = input_data.z; - } else if (value_offset == 3) { - value = input_data.w; - } - if (i == 0) { - output_data.x = value; - } else if (i == 1) { - output_data.y = value; - } else if (i == 2) { - output_data.z = value; - } else if (i == 3) { - output_data.w = value; - } - } - write_imageh(output_image, output_pos, output_data); -} - -__kernel void concatByH(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_W, - __private const int out_H_Start) { - - const int in_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = 
get_global_id(2); - - int2 input_pos; - input_pos.x = in_c * out_W + in_w; - input_pos.y = in_nh; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input; - input = read_imageh(input_image, sampler,input_pos); - - int2 output_pos; - output_pos.x = input_pos.x; - output_pos.y = out_H_Start + input_pos.y; - - write_imageh(output_image, output_pos, input); - -} - -__kernel void concatByW(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int in_W, - __private const int pre_Width, - __private const int out_Width) { - - const int in_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = get_global_id(2); - - int2 input_pos; - input_pos.x = in_c * in_W + in_w; - input_pos.y = in_nh; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input; - input = read_imageh(input_image, sampler,input_pos); - - int2 output_pos; - output_pos.x = input_pos.x + pre_Width + out_Width * in_c; - output_pos.y = input_pos.y; - write_imageh(output_image, output_pos, input); - -} - - - - diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.cl deleted file mode 100644 index 2a5c823295..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "conv_kernel.inc.cl" diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl deleted file mode 100644 index bf31f32970..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ /dev/null @@ -1,2836 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -/* -conv -conv_bn -conv_add -conv_relu -conv_bn_relu -conv_add_relu -conv_add_bn_relu -*/ - -#include "cl_common.h" - -__kernel void conv_3x3( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif - -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - __private const int output_c, __private const int filter_channel, - __private const int group) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - -#ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); -#else - half4 output = 0.0f; -#endif - - half4 input[9]; - if (group == 1) { - for (int i = 0; i < input_c; ++i) { - int2 pos_in = 
(int2)(i * input_width + in_pos_in_one_block.x, - in_pos_in_one_block.y); - input[0] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - - input[1] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - - input[2] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - - input[3] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - input[4] = select( - read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - input[5] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - input[6] = - select(read_imageh(input_image, sampler, - 
(int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - input[7] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - input[8] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - /* - for (int j = 0; j < 9; ++j) { - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, - pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, - pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, - pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, - pos_of_weight); - output.w += dot(input[j], weight_w); - } - */ - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); 
- output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - 
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = 
read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - } - } else { - for (int i = 0; i < 4; i++) { - int used_input_channel_num = - (out_c * 4 + i) / (output_c / group) * filter_channel; - for (int f_c = 0; f_c < filter_channel; ++f_c) { - int input_c = used_input_channel_num + f_c; - int input_block = input_c / 4; - int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, - in_pos_in_one_block.y); - input[0] = select( - read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - 
(half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[1] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[2] = select( - read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[3] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[4] = select( - read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[5] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[6] = select( - read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x - 
dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[7] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[8] = select( - read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - half tmp_out = 0; - for (int j = 0; j < 9; j++) { - int2 pos_of_weight; - pos_of_weight.x = (f_c / 4) * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; - half4 weight = read_imageh(filter, sampler, pos_of_weight); - int f_c_offset = f_c % 4; - half f_value; - if (f_c_offset == 0) { - f_value = weight.x; - } else if (f_c_offset == 1) { - f_value = weight.y; - } else if (f_c_offset == 2) { - f_value = weight.z; - } else if (f_c_offset == 3) { - f_value = weight.w; - } - int input_c_offset = input_c % 4; - half input_value; - if (input_c_offset == 0) { - input_value = input[j].x; - } else if (input_c_offset == 1) { - input_value = input[j].y; - } else if (input_c_offset == 2) { - input_value = input[j].z; - } else if (input_c_offset == 3) { - input_value = input[j].w; - } - tmp_out += f_value * input_value; - } - - if (i == 0) { - output.x += tmp_out; - } else if (i == 1) { - output.y += tmp_out; - } else if (i == 2) { - output.z += tmp_out; - } else if (i == 3) { - output.w += tmp_out; - } - } - } - } - -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output = activation(output); -#endif - - 
write_imageh(output_image, output_pos, output); -} - -// dilation == 1 -__kernel void conv_3x3spl( - __private const int item_ch, __private const int item_w, - __private const int item_h, __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int pad, __private const int dilation, - __private const int in_ch, __private const int in_w, - __private const int in_h, __private const int out_w, - __private const int out_h) { - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; - -#ifdef BIASE_CH - - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; - -#elif defined(BIASE_ELE) - - half4 output[5]; - output[0] = - read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = read_imageh(bias, sampler, - 
(int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = read_imageh(bias, sampler, - (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = read_imageh(bias, sampler, - (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = read_imageh(bias, sampler, - (int2)(out_w_base_id + out_w_id4, item_h_id)); - } -#else - half4 output[5] = {0.0f}; -#endif - - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - half4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * 3; - int filter_h_val1 = filter_h_val0 + 3; - int filter_h_val2 = filter_h_val1 + 3; - int filter_h_val3 = filter_h_val2 + 3; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * 3; - - for (int h = 0; h < 3; h++) { - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, - (out_batch_id * in_h + in_h_id + h < 0 || - out_batch_id * in_h + in_h_id + h >= in_h)); - - for (int w = 0; w < 3; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, 
filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = - read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); - input[1] = - read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = - read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = - read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); - input[4] = - read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], 
output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } - } - } -#ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output[0] = mad(scale, output[0], biase); - if (out_w_id1 < out_w) { - output[1] = mad(scale, output[1], biase); - } - if (out_w_id2 < out_w) { - output[2] = mad(scale, output[2], biase); - } - if (out_w_id3 < out_w) { - output[3] = mad(scale, output[3], biase); - } - if (out_w_id4 < out_w) { - output[4] = mad(scale, output[4], biase); - } -#endif - -#ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); - output[2] = activation(output[2]); - output[3] = activation(output[3]); - output[4] = activation(output[4]); -#endif - write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } - if (out_w_id2 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); - } - if (out_w_id3 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); - } - if (out_w_id4 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); - } -} - -__kernel void depth_conv_3x3( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private 
const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const int batch_index = out_nh / output_height; - - const int out_nh_in_one_batch = out_nh % output_height; - - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - - int2 in_pos_in_one_block = - ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); - -#ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); -#else - half4 output = 0.0f; -#endif - - const int filter_width = 3; - const int filter_height = 3; - - int2 pos_in_input_block = - (int2)(out_c * input_width, batch_index * input_height); - - int2 pos_in_filter_block = - (int2)(out_c * filter_width, batch_index * filter_height); - - int filter_x = pos_in_filter_block.x; - int filter_y = pos_in_filter_block.y; - - half4 inputs[9]; - - inputs[0] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, - pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 || - in_pos_in_one_block.y - 1 < 0 || - in_pos_in_one_block.x - 1 >= input_width || - in_pos_in_one_block.y - 1 >= input_height) - << 15)); - - inputs[1] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x, - pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), - 
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - 1 >= input_height) - << 15)); - - inputs[2] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, - pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 || - in_pos_in_one_block.y - 1 < 0 || - in_pos_in_one_block.x + 1 >= input_width || - in_pos_in_one_block.y - 1 >= input_height) - << 15)); - - inputs[3] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, - pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - 1 >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - half4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ - - inputs[4] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x, - pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - inputs[5] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, - pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x + 1 >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - inputs[6] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, - pos_in_input_block.y + in_pos_in_one_block.y + 1)), 
- (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 || - in_pos_in_one_block.y + 1 < 0 || - in_pos_in_one_block.x - 1 >= input_width || - in_pos_in_one_block.y + 1 >= input_height) - << 15)); - - inputs[7] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x, - pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + 1 >= input_height) - << 15)); - - inputs[8] = select( - read_imageh(input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, - pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 || - in_pos_in_one_block.y + 1 < 0 || - in_pos_in_one_block.x + 1 >= input_width || - in_pos_in_one_block.y + 1 >= input_height) - << 15)); - - half4 filters[9]; - filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y)); - filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y)); - filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y)); - filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1)); - filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1)); - filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1)); - filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2)); - filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2)); - filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2)); - - for (int i = 0; i < 9; i++) { - output += inputs[i] * filters[i]; - } -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output = activation(output); -#endif - - /* - if (output_pos.x == 112 && output_pos.y == 0) { - for (int i = 0; i 
< 9; ++i) { - half4 input1 = inputs[i]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf \n", i, in); - } - float4 out = (float4)(output.x, output.y, output.z, output.w); - printf(" depth wise output output4 = %v4hlf \n", out); - printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); - printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); - printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); - printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); - } - */ - - write_imageh(output_image, output_pos, output); -} - -__kernel void depth_conv_3x3s1( - __private const int ou_ch_blk, __private const int ou_w_blk, - __private const int ou_nh, __read_only image2d_t input, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int pad, __private const int dilation, - __private const int in_ch, __private const int in_w, /* of one block */ - __private const int in_h, /* of one block */ - __private const int ou_w, __private const int ou_h) { - - const int ou_ch_blk_id = get_global_id(0); - const int ou_w_blk_id = get_global_id(1); - const int ou_nh_id = get_global_id(2); - const int w_blk_size = 2; - - const int batch_id = ou_nh_id / ou_h; - int ou_col_id = ou_w_blk_id * w_blk_size; - int ou_row_id = ou_nh_id % ou_h; - int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - - // input pos in one block and on batch - int col_id = ou_col_id - pad; - int row_id = ou_row_id - pad; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - -#ifdef BIASE_CH - half4 output[2]; - output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0)); - output[1] = output[0]; -#elif defined(BIASE_ELE) - half4 output[2]; - output[0] = 
read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id)); - if (ou_col_id + 1 < ou_w) { - output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id)); - } -#else - half4 output[2] = {0.0f}; -#endif - - half4 inputs[12]; - - int filter_x = ou_ch_blk_id * 3; - int filter_y = 0; - half4 filters[9]; - filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y)); - filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y)); - filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y)); - - int in_x = mad24(ou_ch_blk_id, in_w, col_id); - int in_y = mad24(batch_id, in_h, row_id); - - int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); - int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); - inputs[0] = read_imageh(input, sampler, (int2)(x0, y0)); - int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); - inputs[1] = read_imageh(input, sampler, (int2)(x1, y0)); - int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); - inputs[2] = read_imageh(input, sampler, (int2)(x2, y0)); - int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); - inputs[3] = read_imageh(input, sampler, (int2)(x3, y0)); - - output[0] = mad(inputs[0], filters[0], output[0]); - output[1] = mad(inputs[1], filters[0], output[1]); - - output[0] = mad(inputs[1], filters[1], output[0]); - output[1] = mad(inputs[2], filters[1], output[1]); - - output[0] = mad(inputs[2], filters[2], output[0]); - output[1] = mad(inputs[3], filters[2], output[1]); - - filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1)); - filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1)); - filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1)); - - int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); - inputs[4] = read_imageh(input, sampler, (int2)(x0, y1)); - inputs[5] = read_imageh(input, sampler, (int2)(x1, y1)); - inputs[6] = read_imageh(input, sampler, (int2)(x2, 
y1)); - inputs[7] = read_imageh(input, sampler, (int2)(x3, y1)); - - output[0] = mad(inputs[4], filters[3], output[0]); - output[1] = mad(inputs[5], filters[3], output[1]); - - output[0] = mad(inputs[5], filters[4], output[0]); - output[1] = mad(inputs[6], filters[4], output[1]); - - output[0] = mad(inputs[6], filters[5], output[0]); - output[1] = mad(inputs[7], filters[5], output[1]); - - filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2)); - filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2)); - filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2)); - - int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); - inputs[8] = read_imageh(input, sampler, (int2)(x0, y2)); - inputs[9] = read_imageh(input, sampler, (int2)(x1, y2)); - inputs[10] = read_imageh(input, sampler, (int2)(x2, y2)); - inputs[11] = read_imageh(input, sampler, (int2)(x3, y2)); - - output[0] = mad(inputs[8], filters[6], output[0]); - output[1] = mad(inputs[9], filters[6], output[1]); - - output[0] = mad(inputs[9], filters[7], output[0]); - output[1] = mad(inputs[10], filters[7], output[1]); - - output[0] = mad(inputs[10], filters[8], output[0]); - output[1] = mad(inputs[11], filters[8], output[1]); -#ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0)); - output[0] = mad(scale, output[0], biase); - if (ou_col_id + 1 < ou_w) { - output[1] = mad(scale, output[1], biase); - } -#endif - -#ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); -#endif - - write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]); - if (ou_col_id + 1 < ou_w) { - write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); - } -} - -__kernel void conv_1x1( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only 
image2d_t input_image, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const uint kernelHXW = 1; - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); - int2 in_pos_in_one_block = - ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); - -#ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); -#else - half4 output = 0.0f; -#endif - - for (int i = 0; i < input_c; ++i) { - int2 pos_in = - (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - half4 input = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - /* - output.x = dot(input, weight0); - output.y = dot(input, weight1); - output.z = dot(input, weight2); - output.w = dot(input, weight3); - */ - - output = mad(input.x, weight0, output); - output = mad(input.y, weight1, 
output); - output = mad(input.z, weight2, output); - output = mad(input.w, weight3, output); - } - -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output = activation(output); -#endif - - write_imageh(output_image, output_pos, output); -} -__kernel void conv_1x1_simple( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int input_c_origin, __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - __private const int old_w) { - half zero = 0.0f; - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int out_w0 = out_w; - int out_w1 = out_w + global_size_dim1; - int out_w2 = out_w + global_size_dim1 * 2; - int out_w3 = out_w + global_size_dim1 * 3; - - int outpos_main = mul24(out_c, old_w); - int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); - int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); - int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); - int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 stride_xy = (int2)(stride, stride); - - int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); - int2 in_pos_in_one_block0 = - ouput_pos_in_one_block0 * 
stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); - int2 in_pos_in_one_block1 = - ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); - int2 in_pos_in_one_block2 = - ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); - int2 in_pos_in_one_block3 = - ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); - -#ifdef BIASE_CH - half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = output0; - half4 output2 = output0; - half4 output3 = output0; -#elif defined(BIASE_ELE) - half4 output0 = read_imageh(bias, sampler, output_pos0); - half4 output1 = output0; - half4 output2 = output0; - half4 output3 = output0; - -#else - half4 output0 = 0.0f; - half4 output1 = 0.0f; - half4 output2 = 0.0f; - half4 output3 = 0.0f; -#endif - - for (int i = 0; i < input_c; ++i) { - // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, - in_pos_in_one_block0.y); - half4 input0 = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); - // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, - in_pos_in_one_block1.y); - half4 input1 = read_imageh(input_image, sampler, pos_in); - - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, 
output1); - - // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, - in_pos_in_one_block2.y); - half4 input2 = read_imageh(input_image, sampler, pos_in); - - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); - - // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, - in_pos_in_one_block3.y); - half4 input3 = read_imageh(input_image, sampler, pos_in); - - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); - } - -#ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); -#endif - - if (out_w0 < old_w) { - write_imageh(output_image, output_pos0, output0); - } - - if (out_w1 < old_w) { - write_imageh(output_image, output_pos1, output1); - } - - if (out_w2 < old_w) { - write_imageh(output_image, output_pos2, output2); - } - - if (out_w3 < old_w) { - write_imageh(output_image, output_pos3, output3); - } -} -__kernel void conv_1x1_wrapped( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#if 
defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int input_c_origin, __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - __private const int old_w) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int out_w0 = out_w; - int out_w1 = out_w + global_size_dim1; - int out_w2 = out_w + global_size_dim1 * 2; - int out_w3 = out_w + global_size_dim1 * 3; - - int outpos_main = mul24(out_c, old_w); - int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); - int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); - int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); - int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 stride_xy = (int2)(stride, stride); - - int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); - int2 in_pos_in_one_block0 = - ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); - int2 in_pos_in_one_block1 = - ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); - int2 in_pos_in_one_block2 = - ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); - int2 in_pos_in_one_block3 = - ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); - -#ifdef BIASE_CH - half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = 
read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - half4 output0 = read_imageh(bias, sampler, output_pos0); - half4 output1 = read_imageh(bias, sampler, output_pos1); - half4 output2 = read_imageh(bias, sampler, output_pos2); - half4 output3 = read_imageh(bias, sampler, output_pos3); - -#else - half4 output0 = 0.0f; - half4 output1 = 0.0f; - half4 output2 = 0.0f; - half4 output3 = 0.0f; -#endif - - int max_w_bound = input_c * input_width; - int burndary_index = input_c * 4 - input_c_origin; - for (int i = 0; i < input_c; ++i) { - // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, - in_pos_in_one_block0.y); - half4 input0 = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - - if ((max_w_bound - pos_in.x - 1) < input_width && - (max_w_bound - pos_in.x - 1) >= 0) { - if (burndary_index == 0) { - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); - } else if (burndary_index == 1) { - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(0.0f, weight3, output0); - - } else if (burndary_index == 2) { - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(0.0f, weight2, output0); - output0 = mad(0.0f, weight3, output0); - } else if (burndary_index == 3) { - output0 = mad(input0.x, weight0, output0); - output0 = mad(0.0f, 
weight1, output0); - output0 = mad(0.0f, weight2, output0); - output0 = mad(0.0f, weight3, output0); - } - } else { - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); - } - - // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, - in_pos_in_one_block1.y); - half4 input1 = read_imageh(input_image, sampler, pos_in); - - if (abs(max_w_bound - pos_in.x) < input_width) { - if (burndary_index == 0) { - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); - } else if (burndary_index == 1) { - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(0.0f, weight3, output1); - - } else if (burndary_index == 2) { - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(0.0f, weight2, output1); - output1 = mad(0.0f, weight3, output1); - } else if (burndary_index == 3) { - output1 = mad(input1.x, weight0, output1); - output1 = mad(0.0f, weight1, output1); - output1 = mad(0.0f, weight2, output1); - output1 = mad(0.0f, weight3, output1); - } - } else { - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); - } - - // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, - in_pos_in_one_block2.y); - half4 input2 = read_imageh(input_image, sampler, pos_in); - - if (abs(max_w_bound - pos_in.x) < input_width) { - if (burndary_index == 0) { - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, 
weight3, output2); - } else if (burndary_index == 1) { - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(0.0f, weight3, output2); - - } else if (burndary_index == 2) { - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(0.0f, weight2, output2); - output2 = mad(0.0f, weight3, output2); - } else if (burndary_index == 3) { - output2 = mad(input2.x, weight0, output2); - output2 = mad(0.0f, weight1, output2); - output2 = mad(0.0f, weight2, output2); - output2 = mad(0.0f, weight3, output2); - } - } else { - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); - } - - // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, - in_pos_in_one_block3.y); - half4 input3 = read_imageh(input_image, sampler, pos_in); - - if (abs(max_w_bound - pos_in.x) < input_width) { - if (burndary_index == 0) { - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); - } else if (burndary_index == 1) { - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(0.0f, weight3, output3); - - } else if (burndary_index == 2) { - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(0.0f, weight2, output3); - output3 = mad(0.0f, weight3, output3); - } else if (burndary_index == 3) { - output3 = mad(input3.x, weight0, output3); - output3 = mad(0.0f, weight1, output3); - output3 = mad(0.0f, weight2, output3); - output3 = mad(0.0f, weight3, output3); - } - } else { - output3 = mad(input3.x, weight0, output3); - output3 = 
mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); - } - } - -#ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); -#endif - - if (out_w0 < old_w) { - write_imageh(output_image, output_pos0, output0); - } - - if (out_w1 < old_w) { - write_imageh(output_image, output_pos1, output1); - } - - if (out_w2 < old_w) { - write_imageh(output_image, output_pos2, output2); - } - - if (out_w3 < old_w) { - write_imageh(output_image, output_pos3, output3); - } -} - -__kernel void conv_7x7( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif - -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = 
get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const int filter_n0 = 4 * out_c + 0; - const int filter_n1 = 4 * out_c + 1; - const int filter_n2 = 4 * out_c + 2; - const int filter_n3 = 4 * out_c + 3; - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - -#ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); -#else - half4 output = 0.0f; -#endif - - half4 input; - half4 filter[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = - (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for (int j = 0; j < 7; j++) { - for (int k = 0; k < 7; k++) { - input = select( - read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 3) * dilation, - pos_in.y + (k - 3) * dilation)), - (half4)(0.0f), - (ushort4)( - (in_pos_in_one_block.x + (j - 3) * dilation < 0 || - in_pos_in_one_block.y + (k - 3) * dilation < 0 || - in_pos_in_one_block.x + (j - 3) * dilation >= input_width || - in_pos_in_one_block.y + (k - 3) * dilation >= input_height) - << 15)); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 7 + filter_w; - filter_pos0.y = filter_n0 * 7 + filter_h; - - filter_pos1.x = filter_c * 7 + filter_w; - filter_pos1.y = filter_n1 * 7 + filter_h; - - filter_pos2.x = filter_c * 7 + filter_w; - 
filter_pos2.y = filter_n2 * 7 + filter_h; - - filter_pos3.x = filter_c * 7 + filter_w; - filter_pos3.y = filter_n3 * 7 + filter_h; - - filter[0] = read_imageh(filter_image, sampler, filter_pos0); - filter[1] = read_imageh(filter_image, sampler, filter_pos1); - filter[2] = read_imageh(filter_image, sampler, filter_pos2); - filter[3] = read_imageh(filter_image, sampler, filter_pos3); - - output.x += dot(input, filter[0]); - output.y += dot(input, filter[1]); - output.z += dot(input, filter[2]); - output.w += dot(input, filter[3]); - } - } - } - -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output = activation(output); -#endif - - write_imageh(output_image, output_pos, output); -} - -__kernel void conv_7x7Pt1x2( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif - -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w1 = get_global_id(1); - const int out_nh = get_global_id(2); - - if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const int out_w = out_w1 * 2; - - int2 output_pos = (int2)(out_c * output_width + out_w, out_nh); - - const int filter_n0 = 4 * out_c + 0; - const int filter_n1 = 4 * out_c + 1; - const int filter_n2 = 4 * out_c 
+ 2; - const int filter_n3 = 4 * out_c + 3; - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - - half4 output0 = 0.0f; - half4 output1 = 0.0f; -#ifdef BIASE_CH - output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - output1 = output0; -#elif defined(BIASE_ELE) - output0 = read_imageh(bias, sampler, output_pos); - output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y)); -#else - output0 = 0.0f; - output1 = 0.0f; -#endif - - half4 input[8]; - half4 filter0[4]; - half4 filter1[4]; - half4 filter2[4]; - half4 filter3[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = - (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for (int k = 0; k < 7; k++) { - for (int j = 0; j < 8; j++) { - input[j] = select( - read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 3) * dilation, - pos_in.y + (k - 3) * dilation)), - (half4)(0.0f), - (ushort4)( - (in_pos_in_one_block.x + (j - 3) * dilation < 0 || - in_pos_in_one_block.y + (k - 3) * dilation < 0 || - in_pos_in_one_block.x + (j - 3) * dilation >= input_width || - in_pos_in_one_block.y + (k - 3) * dilation >= input_height) - << 15)); - - int filter_h = k; - int filter_w = j; - int filter_c = i; - - if (j < 7) { - filter_pos0.x = filter_c * 7 + filter_w; - filter_pos0.y = filter_n0 * 7 + filter_h; - - filter_pos1.x = filter_c * 7 + filter_w; - filter_pos1.y = filter_n1 * 7 + filter_h; - - filter_pos2.x = filter_c * 7 + filter_w; - filter_pos2.y = filter_n2 * 7 + filter_h; - - filter_pos3.x = filter_c * 7 + 
filter_w; - filter_pos3.y = filter_n3 * 7 + filter_h; - - filter0[0] = read_imageh(filter_image, sampler, filter_pos0); - filter0[1] = read_imageh(filter_image, sampler, filter_pos1); - filter0[2] = read_imageh(filter_image, sampler, filter_pos2); - filter0[3] = read_imageh(filter_image, sampler, filter_pos3); - - output0.x += dot(input[j], filter0[0]); - output0.y += dot(input[j], filter0[1]); - output0.z += dot(input[j], filter0[2]); - output0.w += dot(input[j], filter0[3]); - } - - if (j > 0) { - output1.x += dot(input[j], filter1[0]); - output1.y += dot(input[j], filter1[1]); - output1.z += dot(input[j], filter1[2]); - output1.w += dot(input[j], filter1[3]); - } - - filter1[0] = filter0[0]; - filter1[1] = filter0[1]; - filter1[2] = filter0[2]; - filter1[3] = filter0[3]; - } - } - } - -#ifdef BATCH_NORM - half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0)); - half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output0 = output0 * s + b; - output1 = output1 * s + b; -#endif - -#ifdef RELU - output0 = activation(output0); - output1 = activation(output1); -#endif - write_imageh(output_image, output_pos, output0); - if ((output_pos.x + 1) % output_width != 0) { - write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1); - } -} - -// dilation == 1 -__kernel void conv_7x7spl( - __private const int item_ch, __private const int item_w, - __private const int item_h, __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int pad, __private const int dilation, - __private const int in_ch, __private const int in_w, - __private const int in_h, __private const int out_w, - __private const int out_h) { - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | 
CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - // filter - const int filter_w = 7; - const int filter_h = 7; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; - -#ifdef BIASE_CH - - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; - -#elif defined(BIASE_ELE) - - half4 output[5]; - output[0] = - read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = read_imageh(bias, sampler, - (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = read_imageh(bias, sampler, - (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = read_imageh(bias, sampler, - (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = read_imageh(bias, sampler, - (int2)(out_w_base_id + out_w_id4, item_h_id)); - } -#else - half4 output[5] = {0.0f}; -#endif - - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - half4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * filter_h; - int filter_h_val1 = filter_h_val0 + filter_h; - int filter_h_val2 = filter_h_val1 + filter_h; - int 
filter_h_val3 = filter_h_val2 + filter_h; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * filter_w; - - for (int h = 0; h < filter_h; h++) { - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, - (out_batch_id * in_h + in_h_id + h < 0 || - out_batch_id * in_h + in_h_id + h >= in_h)); - - for (int w = 0; w < filter_w; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh( - filter_image, sampler, - (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = - read_imageh(input_image, sampler, (int2)(in_w_val0, 
in_h_val)); - input[1] = - read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = - read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = - read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); - input[4] = - read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } - } - } -#ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output[0] = mad(scale, output[0], biase); - if (out_w_id1 < out_w) { - output[1] = mad(scale, output[1], biase); - } - if (out_w_id2 < out_w) { - output[2] = mad(scale, output[2], biase); - } - if (out_w_id3 < out_w) { - 
output[3] = mad(scale, output[3], biase); - } - if (out_w_id4 < out_w) { - output[4] = mad(scale, output[4], biase); - } -#endif - -#ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); - output[2] = activation(output[2]); - output[3] = activation(output[3]); - output[4] = activation(output[4]); -#endif - write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } - if (out_w_id2 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); - } - if (out_w_id3 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); - } - if (out_w_id4 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); - } -} - -__kernel void conv_5x5( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif - -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const filter_n0 = 4 * out_c + 0; - const 
filter_n1 = 4 * out_c + 1; - const filter_n2 = 4 * out_c + 2; - const filter_n3 = 4 * out_c + 3; - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - -#ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); -#else - half4 output = 0.0f; -#endif - - half4 input; - half4 filter[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = - (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for (int j = 0; j < 5; j++) { - for (int k = 0; k < 5; k++) { - input = select( - read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 2) * dilation, - pos_in.y + (k - 2) * dilation)), - (half4)(0.0f), - (ushort4)( - (in_pos_in_one_block.x + (j - 2) * dilation < 0 || - in_pos_in_one_block.y + (k - 2) * dilation < 0 || - in_pos_in_one_block.x + (j - 2) * dilation >= input_width || - in_pos_in_one_block.y + (k - 2) * dilation >= input_height) - << 15)); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 5 + filter_w; - filter_pos0.y = filter_n0 * 5 + filter_h; - - filter_pos1.x = filter_c * 5 + filter_w; - filter_pos1.y = filter_n1 * 5 + filter_h; - - filter_pos2.x = filter_c * 5 + filter_w; - filter_pos2.y = filter_n2 * 5 + filter_h; - - filter_pos3.x = filter_c * 5 + filter_w; - filter_pos3.y = filter_n3 * 5 + filter_h; - - filter[0] = read_imageh(filter_image, sampler, filter_pos0); - filter[1] = read_imageh(filter_image, sampler, filter_pos1); - 
filter[2] = read_imageh(filter_image, sampler, filter_pos2); - filter[3] = read_imageh(filter_image, sampler, filter_pos3); - - output.x += dot(input, filter[0]); - output.y += dot(input, filter[1]); - output.z += dot(input, filter[2]); - output.w += dot(input, filter[3]); - } - } - } - -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output = activation(output); -#endif - - write_imageh(output_image, output_pos, output); -} - -__kernel void convBNAdd_3x3( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif - -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = 
ouput_pos_in_one_block.y * stride + offset; - - half4 output = (half4)0.0f; - - half4 input[9]; - - for (int i = 0; i < input_c; ++i) { - int2 pos_in = - (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - input[0] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - - input[1] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - - input[2] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - - input[3] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - input[4] = select( - read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - input[5] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y < 0 || - 
in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - - input[6] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - input[7] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - input[8] = - select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - /* - for (int j = 0; j < 9; ++j) { - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - } - */ - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); 
- output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - 
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = 
read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - } - -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef BIASE_CH - output += read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - output += read_imageh(bias, sampler, output_pos); -#endif - 
-#ifdef RELU - output = activation(output); -#endif - - write_imageh(output_image, output_pos, output); -} - -__kernel void convBNAdd_1x1( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const uint kernelHXW = 1; - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); - int2 in_pos_in_one_block = - ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); - - half4 output = 0.0f; - - for (int i = 0; i < input_c; ++i) { - int2 pos_in = - (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - half4 input = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - /* - output.x = dot(input, weight0); - output.y = dot(input, weight1); - output.z = dot(input, weight2); - output.w = dot(input, 
weight3); - */ - - output = mad(input.x, weight0, output); - output = mad(input.y, weight1, output); - output = mad(input.z, weight2, output); - output = mad(input.w, weight3, output); - } - -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef BIASE_CH - output += read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - output += read_imageh(bias, sampler, output_pos); -#endif - -#ifdef RELU - output = activation(output); -#endif - - write_imageh(output_image, output_pos, output); -} - -__kernel void convBNAdd_1x1_spl( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - __private const int old_w) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int out_w0 = out_w; - int out_w1 = out_w + global_size_dim1; - int out_w2 = out_w + global_size_dim1 * 2; - int out_w3 = out_w + global_size_dim1 * 3; - - int outpos_main = mul24(out_c, old_w); - int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); - int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); - int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); - int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | 
CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int2 stride_xy = (int2)(stride, stride); - - int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); - int2 in_pos_in_one_block0 = - ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); - int2 in_pos_in_one_block1 = - ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); - int2 in_pos_in_one_block2 = - ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); - - int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); - int2 in_pos_in_one_block3 = - ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); - - half4 output0 = 0.0f; - half4 output1 = 0.0f; - half4 output2 = 0.0f; - half4 output3 = 0.0f; - - for (int i = 0; i < input_c; ++i) { - // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, - in_pos_in_one_block0.y); - half4 input0 = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); - - // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, - in_pos_in_one_block1.y); - half4 input1 = read_imageh(input_image, sampler, pos_in); - // - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 
3)); - - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); - - // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, - in_pos_in_one_block2.y); - half4 input2 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); - - // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, - in_pos_in_one_block3.y); - half4 input3 = read_imageh(input_image, sampler, pos_in); - - // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + - // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 - // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * - // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i - // * 4 + 3)); - - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); - } - -#ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); - - output3 = output3 * 
read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef BIASE_CH - output0 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output1 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output2 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output3 += read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - output0 += read_imageh(bias, sampler, output_pos0); - output1 += read_imageh(bias, sampler, output_pos1); - output2 += read_imageh(bias, sampler, output_pos2); - output3 += read_imageh(bias, sampler, output_pos3); -#endif - -#ifdef RELU - output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); -#endif - - if (out_w0 < old_w) { - write_imageh(output_image, output_pos0, output0); - } - - if (out_w1 < old_w) { - write_imageh(output_image, output_pos1, output1); - } - - if (out_w2 < old_w) { - write_imageh(output_image, output_pos2, output2); - } - - if (out_w3 < old_w) { - write_imageh(output_image, output_pos3, output3); - } -} - -__kernel void depth_conv( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input, - __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - __private const int filter_width, __private const int filter_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = 
get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - const int batch_index = out_nh / output_height; - const int out_nh_in_one_batch = out_nh % output_height; - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - int2 in_pos_in_one_block = - ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); -#ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); -#else - half4 output = 0.0f; -#endif - - int2 pos_in_input_block = - (int2)(out_c * input_width, batch_index * input_height); - int2 pos_in_filter_block = - (int2)(out_c * filter_width, batch_index * filter_height); - int filter_x = pos_in_filter_block.x; - int filter_y = pos_in_filter_block.y; - int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x; - int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y; - int2 align = {filter_width / 2, filter_height / 2}; - /* if (output_pos.x == 0 && output_pos.y == 0){ - printf("align.x=%d align.y=%d \n ",align.x,align.y); - printf("stride=%d \n ",stride); - }*/ - for (int fy = 0; fy < filter_height; ++fy) { - for (int fx = 0; fx < filter_width; ++fx) { - int x_off = fx - align.x; - int y_off = fy - align.y; - /* if (output_pos.x == 0 && output_pos.y == 0){ - printf("fx=%d fy=%d \n ",fx,fy); - printf("x_off=%d y_off=%d \n ",x_off,y_off); - }*/ - half4 in = select( - read_imageh(input, sampler, - (int2)(input_x_base + x_off, input_y_base + y_off)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + x_off < 0 || - in_pos_in_one_block.y + y_off < 0 || - in_pos_in_one_block.x + x_off >= input_width || - in_pos_in_one_block.y + y_off >= input_height) - << 15)); - half4 f = - read_imageh(filter, sampler, (int2)(filter_x + fx, filter_y + fy)); - output += in 
* f; - /*if (output_pos.x ==111 && output_pos.y == 0){ - printf("in={ %f , %f , %f , %f } \n - ",convert_float(in.x),convert_float(in.y),convert_float(in.z),convert_float(in.w)); - printf("filter={ %f , %f , %f , %f } \n - ",convert_float(f.x),convert_float(f.y),convert_float(f.z),convert_float(f.w)); - printf("output={ %f , %f , %f , %f } \n - ",convert_float(output.x),convert_float(output.y),convert_float(output.z),convert_float(output.w)); - }*/ - } - } -#ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); -#endif - -#ifdef RELU - output = activation(output); -#endif - write_imageh(output_image, output_pos, output); -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl deleted file mode 100644 index 96044b575e..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl +++ /dev/null @@ -1,553 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "cl_common.h" - -__kernel void conv_transpose_b(__private const int input_c_block, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height, - __read_only image2d_t input_image, - __read_only image2d_t filter, - __write_only image2d_t output_image) { - - const int out_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = get_global_id(2); - const int n = in_nh / input_height; - const int h = in_nh % input_height; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 input1, input2, input3, input4; - half4 output1 = 0.0f, output2 = 0.0f, output3 = 0.0f, output4 = 0.0f; - half4 w = 0.0f; - int2 pos_in; - for (int i = 0; i < input_c_block; i += 1) { - pos_in = (int2)(mad24(i, input_width, in_w), in_nh); - input1 = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_w < 0 || h < 0 || in_w >= input_width || h >= input_height) << 15)); - input2 = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + 1, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_w + 1 < 0 || h < 0 || in_w + 1 >= input_width || h >= input_height) << 15)); - input3 = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + 1)), - (half4)(0.0f), - (ushort4)((in_w < 0 || h + 1 < 0 || in_w >= input_width || h + 1 >= input_height) << 15)); - input4 = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + 1, pos_in.y + 1)), - (half4)(0.0f), - (ushort4)((in_w + 1 < 0 || h + 1 < 0 || in_w + 1 >= input_width || h + 1 >= input_height) << 15)); - - int wx = i * 3; - int wy = out_c * 4 * 3; - w = read_imageh(filter, sampler, (int2)(wx, wy)); - output4.x += dot(input4, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy)); - output3.x += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy)); - 
output4.x += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 1)); - output2.x += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1)); - output1.x += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1)); - output2.x += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 2)); - output4.x += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2)); - output3.x += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2)); - output4.x += dot(input1, w); - - wy = (out_c * 4 + 1) * 3; - w = read_imageh(filter, sampler, (int2)(wx, wy)); - output4.y += dot(input4, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy)); - output3.y += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy)); - output4.y += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 1)); - output2.y += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1)); - output1.y += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1)); - output2.y += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 2)); - output4.y += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2)); - output3.y += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2)); - output4.y += dot(input1, w); - - wy = (out_c * 4 + 2) * 3; - w = read_imageh(filter, sampler, (int2)(wx, wy)); - output4.z += dot(input4, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy)); - output3.z += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy)); - output4.z += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 1)); - output2.z += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1)); - output1.z += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1)); - output2.z += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 
2)); - output4.z += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2)); - output3.z += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2)); - output4.z += dot(input1, w); - - wy = (out_c * 4 + 3) * 3; - w = read_imageh(filter, sampler, (int2)(wx, wy)); - output4.w += dot(input4, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy)); - output3.w += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy)); - output4.w += dot(input3, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 1)); - output2.w += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1)); - output1.w += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1)); - output2.w += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx, wy + 2)); - output4.w += dot(input2, w); - w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2)); - output3.w += dot(input1, w); - w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2)); - output4.w += dot(input1, w); - } - - int2 pos_out = (int2)(out_c * output_width + 2 * in_w, n * output_height + 2 * h); - write_imageh(output_image, pos_out, output1); - write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y), output2); - write_imageh(output_image, (int2)(pos_out.x, pos_out.y + 1), output3); - write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y + 1), output4); -} - -__kernel void depthwise_transpose(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w, - __private const int 
in_h, - __private const int out_w, - __private const int out_h, - __private const int filter_w, - __private const int filter_h) { - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_id - int out_b_id = item_h_id / out_h; - int out_w_id_per_ch_blk = item_w_id; - int out_h_id_per_batch = item_h_id % out_h; - int out_w_id = item_ch_id * out_w + out_w_id_per_ch_blk; - - // in_id - int in_w_id_per_ch_blk = (out_w_id_per_ch_blk + pad - filter_w + stride) / stride; - in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0; - int in_h_id_per_batch = (out_h_id_per_batch + pad - filter_h + stride) / stride; - in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0; - - // filter_id - int align_w_i = out_w_id_per_ch_blk + pad - filter_w + 1; - int align_w = align_w_i % stride > 0 ? - align_w_i % stride - stride : align_w_i % stride; - int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + pad < filter_w ? out_w_id_per_ch_blk + pad : filter_w + align_w - 1; - - int align_h_i = out_h_id_per_batch + pad - filter_h + 1; - int align_h = align_h_i % stride > 0 ? - align_h_i % stride - stride : align_h_i % stride; - int filter_h_id = out_h_id_per_batch + pad < filter_h ? 
out_h_id_per_batch + pad : filter_h + align_h - 1; - -#ifdef BIASE_CH - half4 output; - output = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); -#elif defined(BIASE_ELE) - half4 output; - output = read_imageh(bias, sampler, (int2)(out_w_id, item_h_id)); -#else - half4 output = 0.0f; -#endif - half4 filter = 0.0f; - half4 input = 0.0f; - for (int h = filter_h_id; h >= 0; h -= stride) { - int in_h_id = select(out_b_id * in_h + in_h_id_per_batch, -1, - in_h_id_per_batch < 0 || in_h_id_per_batch >= in_h); - for (int w = filter_w_id_per_ch_blk; w >= 0; w -= stride) { - int in_w_id = select(item_ch_id * in_w + in_w_id_per_ch_blk, -1, - in_w_id_per_ch_blk < 0 || in_w_id_per_ch_blk >= in_w); - int filter_w_id = item_ch_id * filter_w + w; - input = read_imageh(input_image, sampler, (int2)(in_w_id, in_h_id)); - filter = read_imageh(filter_image, sampler, (int2)(filter_w_id, h)); - - output = mad(input, filter, output); - in_w_id_per_ch_blk++; - } - in_h_id_per_batch++; - } - -#ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output = mad(scale, output, biase); -#endif - -#ifdef RELU - output = activation(output); -#endif - - write_imageh(output_image, (int2)(out_w_id, item_h_id), output); -} - - -/* batch == 1 pad(output) == 1 out_w % 2 == 0 */ -__kernel void conv_transpose3x3s2(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - 
__private const int out_h, - __private const int filter_w, - __private const int filter_h) { - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_id - int out_w_id_per_ch_blk = item_w_id / 2 * 10 + item_w_id % 2; - int out_h_id = item_h_id; - int out_w_id0 = item_ch_id * out_w + out_w_id_per_ch_blk; - int out_w_id1 = out_w_id0 + 2; - int out_w_id2 = out_w_id1 + 2; - int out_w_id3 = out_w_id2 + 2; - int out_w_id4 = out_w_id3 + 2; - - // in_id - int in_w_id_per_ch_blk = (out_w_id_per_ch_blk) / 2; - in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0; - int in_h_id_per_batch = (out_h_id) / 2; - in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0; - - // filter_id - int align_w_i = out_w_id_per_ch_blk - 1; - int align_w = align_w_i % 2 > 0 ? - align_w_i % 2 - 2 : align_w_i % 2; - int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + 1 < 3 ? out_w_id_per_ch_blk + 1 : 2 + align_w; - - int align_h_i = out_h_id - 1; - int align_h = align_h_i % 2 > 0 ? - align_h_i % 2 - 2 : align_h_i % 2; - int filter_h_id_per_out_ch = out_h_id + 1 < 3 ? 
out_h_id + 1 : 2 + align_h; - -#ifdef BIASE_CH - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; - -#elif defined(BIASE_ELE) - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(out_w_id0, item_h_id)); - if (out_w_id_per_ch_blk + 2 < out_w) { - output[1] = read_imageh(bias, sampler, (int2)(out_w_id1, item_h_id)); - } - if (out_w_id_per_ch_blk + 4 < out_w) { - output[2] = read_imageh(bias, sampler, (int2)(out_w_id2, item_h_id)); - } - if (out_w_id_per_ch_blk + 6 < out_w) { - output[3] = read_imageh(bias, sampler, (int2)(out_w_id3, item_h_id)); - } - if (out_w_id_per_ch_blk + 8 < out_w) { - output[4] = read_imageh(bias, sampler, (int2)(out_w_id4, item_h_id)); - } - -#else - half4 output[5] = {0.0f}; -#endif - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - - half4 input[5] = {0.0f}; - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int filter_w_id = ch * 3; - int h_idx = 0; - for (int h = filter_h_id_per_out_ch; h >= 0; h -= 2) { - int in_h_id = select(in_h_id_per_batch + h_idx, -1, - in_h_id_per_batch + h_idx < 0 || in_h_id_per_batch + h_idx >= in_h); - int filter_h_id = item_ch_id * 12 + h; - int w_idx = 0; - for (int w = filter_w_id_per_ch_blk; w >= 0; w -= 2) { - int in_w_id0 = select(ch * in_w + in_w_id_per_ch_blk + w_idx, -1, - in_w_id_per_ch_blk + w_idx < 0 || in_w_id_per_ch_blk + w_idx >= in_w); - int in_w_id1 = select(ch * in_w + in_w_id_per_ch_blk + 1 + w_idx, -1, - in_w_id_per_ch_blk + 1 + w_idx < 0 || in_w_id_per_ch_blk + 1 + w_idx >= in_w); - int in_w_id2 = select(ch * in_w + in_w_id_per_ch_blk + 2 + w_idx, -1, - in_w_id_per_ch_blk + 2 + w_idx < 0 || in_w_id_per_ch_blk + 2 + w_idx >= in_w); - int in_w_id3 = select(ch * in_w + in_w_id_per_ch_blk + 3 + w_idx, -1, - in_w_id_per_ch_blk + 3 + w_idx < 0 || in_w_id_per_ch_blk + 3 + w_idx >= in_w); - int in_w_id4 = select(ch * in_w + in_w_id_per_ch_blk 
+ 4 + w_idx, -1, - in_w_id_per_ch_blk + 4 + w_idx < 0 || in_w_id_per_ch_blk + 4 + w_idx >= in_w); - - input[0] = read_imageh(input_image, sampler, (int2)(in_w_id0, in_h_id)); - input[1] = read_imageh(input_image, sampler, (int2)(in_w_id1, in_h_id)); - input[2] = read_imageh(input_image, sampler, (int2)(in_w_id2, in_h_id)); - input[3] = read_imageh(input_image, sampler, (int2)(in_w_id3, in_h_id)); - input[4] = read_imageh(input_image, sampler, (int2)(in_w_id4, in_h_id)); - - filter[0] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id)); // in_ch:0-3,out_ch:0 - filter[1] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 3)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 6)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 9)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[0] = mad(input[0].w, filter_trans[3], output[0]); - - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - 
output[2] = mad(input[2].w, filter_trans[3], output[2]); - - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - - output[4] = mad(input[4].x, filter_trans[0], output[4]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - w_idx++; - } - h_idx++; - } - } -#ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output[0] = mad(scale, output[0], biase); - if (out_w_id_per_ch_blk + 2 < out_w) { - output[1] = mad(scale, output[1], biase); - } - if (out_w_id_per_ch_blk + 4 < out_w) { - output[2] = mad(scale, output[2], biase); - } - if (out_w_id_per_ch_blk + 6 < out_w) { - output[3] = mad(scale, output[3], biase); - } - if (out_w_id_per_ch_blk + 8 < out_w) { - output[4] = mad(scale, output[4], biase); - } -#endif - -#ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); - output[2] = activation(output[2]); - output[3] = activation(output[3]); - output[4] = activation(output[4]); - -#endif - - write_imageh(output_image, (int2)(out_w_id0, item_h_id), output[0]); - - if (out_w_id_per_ch_blk + 2 < out_w) { - write_imageh(output_image, (int2)(out_w_id1, item_h_id), output[1]); - } - if (out_w_id_per_ch_blk + 4 < out_w) { - write_imageh(output_image, (int2)(out_w_id2, item_h_id), output[2]); - } - if (out_w_id_per_ch_blk + 6 < out_w) { - write_imageh(output_image, (int2)(out_w_id3, item_h_id), output[3]); - } - if (out_w_id_per_ch_blk + 8 < out_w) { - write_imageh(output_image, (int2)(out_w_id4, item_h_id), output[4]); - } -} - -__kernel void conv_transpose(__private const int item_ch, - __private const int item_w, - __private const 
int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __private const int filter_w, - __private const int filter_h) { - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_id - int out_b_id = item_h_id / out_h; - int out_w_id_per_ch_blk = item_w_id; - int out_h_id_per_batch = item_h_id % out_h; - int out_w_id = item_ch_id * out_w + out_w_id_per_ch_blk; - - // in_id - int in_w_id_per_ch_blk = (out_w_id_per_ch_blk + pad - filter_w + stride) / stride; - in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0; - int in_h_id_per_batch = (out_h_id_per_batch + pad - filter_h + stride) / stride; - in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0; - - // filter_id - int align_w_i = out_w_id_per_ch_blk + pad - filter_w + 1; - int align_w = align_w_i % stride > 0 ? - align_w_i % stride - stride : align_w_i % stride; - int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + pad < filter_w ? out_w_id_per_ch_blk + pad : filter_w + align_w - 1; - - int align_h_i = out_h_id_per_batch + pad - filter_h + 1; - int align_h = align_h_i % stride > 0 ? - align_h_i % stride - stride : align_h_i % stride; - int filter_h_id_per_out_ch = out_h_id_per_batch + pad < filter_h ? 
out_h_id_per_batch + pad : filter_h + align_h - 1; - -#ifdef BIASE_CH - half4 output; - output = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); -#elif defined(BIASE_ELE) - half4 output; - output = read_imageh(bias, sampler, (int2)(out_w_id, item_h_id)); -#else - half4 output = 0.0f; -#endif - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - - half4 input = 0.0f; - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int filter_w_id = ch * filter_w; - int h_idx = 0; - for (int h = filter_h_id_per_out_ch; h >= 0; h -= stride) { - int in_h_id = select(in_h_id_per_batch + h_idx, -1, - in_h_id_per_batch + h_idx < 0 || in_h_id_per_batch + h_idx >= in_h); - int filter_h_id = item_ch_id * filter_h * 4 + h; - int w_idx = 0; - for (int w = filter_w_id_per_ch_blk; w >= 0; w -= stride) { - int in_w_id = select(ch * in_w + in_w_id_per_ch_blk + w_idx, -1, - in_w_id_per_ch_blk + w_idx < 0 || in_w_id_per_ch_blk + w_idx >= in_w); - input = read_imageh(input_image, sampler, (int2)(in_w_id, in_h_id)); - filter[0] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id)); // in_ch:0-3,out_ch:0 - filter[1] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + filter_h)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 2 * filter_h)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 3 * filter_h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 - - output = mad(input.x, filter_trans[0], output); - output = mad(input.y, filter_trans[1], 
output); - output = mad(input.z, filter_trans[2], output); - output = mad(input.w, filter_trans[3], output); - w_idx++; - } - h_idx++; - } - } -#ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output = mad(scale, output, biase); -#endif - -#ifdef RELU - output = activation(output); -#endif - write_imageh(output_image, (int2)(out_w_id, item_h_id), output); -} - diff --git a/mobile/src/operators/kernel/cl/cl_kernel/density_prior_box_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/density_prior_box_kernel.cl deleted file mode 100644 index ff5daa8d01..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/density_prior_box_kernel.cl +++ /dev/null @@ -1,114 +0,0 @@ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define MIN_VALUE -FLT_MAX -__kernel void density_prior_box(__write_only image2d_t output_boxes, - __write_only image2d_t output_variances, - __global float *densities, - __private const float step_h, - __private const float step_w, - __private float variances0, - __private float variances1, - __private float variances2, - __private float variances3, - __private float offset, - __private int den_and_fix_size, - __private int img_width, - __private int img_height, - __private int C, - __private int num_density, - __private int step_average, - __private int input_width, - __private int wid, - __private int fix_ratio_size - ){ - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - int2 output_pos; - output_pos.x = out_c * 4 + out_w; - output_pos.y = out_nh; - half4 output; - half4 variances; - for (int c = 0; c < 4; c++) { - int idx = out_nh % num_density; - int input_h = out_nh / num_density; - int input_w = out_c * 4 + c; - int density_idx; - int density; - int ratio_idx; - int density_i; - int density_j; - int sum = 0; - int pre_sum = 0; - for (int i = 0; i < den_and_fix_size; 
i++) { - pre_sum = sum; - density = densities[i]; - sum += density * density * fix_ratio_size; - if (idx < sum) { - density_idx = i; - break; - } - } - idx = idx - pre_sum; - ratio_idx = idx / (density * density); - idx = idx % (density * density); - density_i = idx / density; - density_j = idx % density; - half fixed_size = densities[den_and_fix_size + density_idx]; - half ratio = densities[2 * den_and_fix_size + ratio_idx]; - half box_width = fixed_size * ratio; - half box_height = fixed_size / ratio; - int shift = step_average / density; - half center_x; - half center_y; - center_x = (input_w + offset) * step_w; - center_x = center_x - step_average / 2.0 + shift / 2.0; - center_x = center_x + density_j * shift; - center_y = (input_h + offset) * step_h; - center_y = center_y - step_average / 2.0 + shift / 2.0; - center_y = center_y + density_i * shift; - half4 box; - box.x = (center_x - box_width / 2.0) / img_width; - box.y = (center_y - box_height / 2.0) / img_height; - box.z = (center_x + box_width / 2.0) / img_width; - box.w = (center_y + box_height / 2.0) / img_height; - box.x = max((float)box.x, 0.0); - box.y = max((float)box.y, 0.0); - box.z = min((float)box.z, 1.0); - box.w = min((float)box.w, 1.0); - half res; - half var; - if (out_w == 0) { - res = box.x; - var = convert_half(variances0); - } else if (out_w == 1) { - res = box.y; - var = convert_half(variances1); - } else if (out_w == 2) { - res = box.z; - var = convert_half(variances2); - } else if (out_w == 3) { - res = box.w; - var = convert_half(variances3); - } - variances.x = var; - variances.y = var; - variances.z = var; - variances.w = var; - if (c == 0) { - output.x = res; - } else if (c == 1) { - output.y = res; - } else if (c == 2) { - output.z = res; - } else if (c == 3) { - output.w = res; - } - } - - write_imageh(output_boxes, (int2)(output_pos.x, output_pos.y), output); - - write_imageh(output_variances, (int2)(output_pos.x, output_pos.y), variances); - -} \ No newline at end of file diff 
--git a/mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl deleted file mode 100644 index 3c3497f917..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl +++ /dev/null @@ -1,18 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define BIASE -#define BATCH_NORM -#define RELU -#include "conv_kernel.inc.cl" diff --git a/mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl deleted file mode 100644 index 2a5c823295..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "conv_kernel.inc.cl" diff --git a/mobile/src/operators/kernel/cl/cl_kernel/dropout_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/dropout_kernel.cl deleted file mode 100644 index fc9dfc8726..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/dropout_kernel.cl +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void dropout(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_W, - __private const float dropoutPro) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input; - half4 output; - - input = read_imageh(input_image, sampler,output_pos); - half4 dropout = (half4)(1 - dropoutPro); - output = dropout * input; - - write_imageh(output_image, output_pos, output); -} - diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl deleted file mode 100644 index f304764868..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords); - half4 output = in + biase; - write_imageh(outputImage,coords,output); - } diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl deleted file mode 100644 index 916dd9d49f..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, - __write_only image2d_t outputImage) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords); - half4 output = in * biase; - write_imageh(outputImage, coords, output); -} - -__kernel void channel_mul(__global image2d_t input, __global image2d_t bias, - __write_only image2d_t outputImage, int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - int2 coords_bias; - coords_bias.x = x / w; - coords_bias.y = 0; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords_bias); - half4 output = in * biase; - write_imageh(outputImage, coords, output); -} - -// etc : 1 1 1 72 -// run time Y [value,0,0,0] * 72 -__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias, - __write_only image2d_t outputImage, int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - - int2 coords_bias0; - int2 coords_bias1; - int2 coords_bias2; - int2 coords_bias3; - - /* if (x == 0 && y == 0) { - half4 b = (half4){0, 0, 0, 0}; - #define PPI(j, k) \ - b = read_imageh(bias, sampler, (int2){j, k}); \ - printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ - convert_float(b.y), convert_float(b.z), convert_float(b.w)); - for (int i = 0; 
i < 73; ++i) { - PPI(i, 0); - } - #undef PPI - }*/ - - coords_bias0.x = x / w * 4; - coords_bias0.y = 0; - - coords_bias1.x = x / w * 4 + 1; - coords_bias1.y = 0; - - coords_bias2.x = x / w * 4 + 2; - coords_bias2.y = 0; - - coords_bias3.x = x / w * 4 + 3; - coords_bias3.y = 0; - - half4 biase0 = read_imageh(bias, sampler, coords_bias0); - half4 biase1 = read_imageh(bias, sampler, coords_bias1); - half4 biase2 = read_imageh(bias, sampler, coords_bias2); - half4 biase3 = read_imageh(bias, sampler, coords_bias3); - /* if (x == 0 && y == 0) { - printf("bias0={ %f , %f , %f , %f }\n ", - convert_float(biase0.x), convert_float(biase0.y), - convert_float(biase0.z), convert_float(biase0.w)); - - printf("bias1={ %f , %f , %f , %f }\n ", - convert_float(biase1.x), convert_float(biase1.y), - convert_float(biase1.z), convert_float(biase1.w)); - printf("bias2={ %f , %f , %f , %f }\n ", - convert_float(biase2.x), convert_float(biase2.y), - convert_float(biase2.z), convert_float(biase2.w)); - printf("bias3={ %f , %f , %f , %f }\n ", - convert_float(biase3.x), convert_float(biase3.y), - convert_float(biase3.z), convert_float(biase3.w)); - }*/ - half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; - half4 in = read_imageh(input, sampler, coords); - half4 output = mad(in, biase, 0); - write_imageh(outputImage, coords, output); -} - -// c 1 1 -__kernel void channel_mul_d3(__global image2d_t input, __global image2d_t bias, - __write_only image2d_t outputImage, int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - int2 coords_bias; - coords_bias.x = x / w; - coords_bias.y = 0; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords_bias); - half4 output = in * biase; - write_imageh(outputImage, coords, output); -} - -__kernel void channel_mul_d4(__global image2d_t input, __global 
image2d_t bias, - __write_only image2d_t outputImage, int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - int2 coords_bias; - coords_bias.x = x / w; - coords_bias.y = 0; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords_bias); - half4 output = in * biase; - write_imageh(outputImage, coords, output); -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl deleted file mode 100644 index 1f62ff377a..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void elementwise_sub(__global image2d_t inputImage, __global image2d_t bias, __write_only image2d_t outputImage) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - half4 input = read_imageh(inputImage, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords); - half4 output = input - biase; - write_imageh(outputImage, coords, output); - } diff --git a/mobile/src/operators/kernel/cl/cl_kernel/exp_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/exp_kernel.cl deleted file mode 100644 index 2227aaab47..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/exp_kernel.cl +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable -#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable - -__kernel void exp_impl(__read_only image2d_t input, __write_only image2d_t output) { - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(x, y)); - half4 out; - out.x = pow(2.71828182, (float)(in.x)); - out.y = pow(2.71828182, (float)(in.y)); - out.z = pow(2.71828182, (float)(in.z)); - out.w = pow(2.71828182, (float)(in.w)); - write_imageh(output, (int2)(x, y), out); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/expend.cl b/mobile/src/operators/kernel/cl/cl_kernel/expend.cl deleted file mode 100644 index 8c74477b6a..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/expend.cl +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void expend_c1( - __private const int OUT_C, __private const int OUT_W, - __private const int OUT_NH, - - __private const int IN_C, __private const int IN_W, - __private const int IN_NH, - - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - - __read_only image2d_t input, __write_only image2d_t output, - __private const int n_times, __private const int c_times, - __private const int h_times, __private const int w_times) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) { - return; - } - - const int out_n = out_nh / output_height; - const int out_h = out_nh % output_height; - - // const real_in_c = out_c * 4 / c_times; - // const int in_c = real_in_c / 4; - const int in_c = 0; - - // const int in_c = out_c / c_times; - const int in_w = out_w / w_times; - - const int in_h = out_h / h_times; - const int in_n = out_n / n_times; - const int in_nh = in_n * input_height + in_h; - - int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh); - int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, input_pos); - in.y = in.x; - in.z = in.x; - in.w = in.x; - write_imageh(output, output_pos, in); -} - -__kernel void expend_c2( - __private const int OUT_C, __private const int OUT_W, - __private const int OUT_NH, - - __private const int IN_C, __private const int IN_W, - __private const int IN_NH, - - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - - __read_only image2d_t input, __write_only 
image2d_t output, - __private const int n_times, __private const int c_times, - __private const int h_times, __private const int w_times) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) { - return; - } - - const int out_n = out_nh / output_height; - const int out_h = out_nh % output_height; - - // const real_in_c = out_c * 4 / c_times; - // const int in_c = real_in_c / 4; - const int in_c = 0; - - // const int in_c = out_c / c_times; - const int in_w = out_w / w_times; - - const int in_h = out_h / h_times; - const int in_n = out_n / n_times; - const int in_nh = in_n * input_height + in_h; - - int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh); - int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, input_pos); - in.z = in.x; - in.w = in.y; - write_imageh(output, output_pos, in); -} - - -__kernel void expend_c4( - __private const int OUT_C, __private const int OUT_W, - __private const int OUT_NH, - - __private const int IN_C, __private const int IN_W, - __private const int IN_NH, - - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, __private const int output_height, - - __read_only image2d_t input, __write_only image2d_t output, - __private const int n_times, __private const int c_times, - __private const int h_times, __private const int w_times) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) { - return; - } - - const int out_n = out_nh / output_height; - const int out_h = out_nh % output_height; - - // const real_in_c = out_c * 4 / c_times; - // const int in_c = 
real_in_c / 4; - const int in_c = 0; - - // const int in_c = out_c / c_times; - const int in_w = out_w / w_times; - - const int in_h = out_h / h_times; - const int in_n = out_n / n_times; - const int in_nh = in_n * input_height + in_h; - - int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh); - int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, input_pos); - write_imageh(output, output_pos, in); -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/feed_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/feed_kernel.cl deleted file mode 100644 index 27ca4d296e..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/feed_kernel.cl +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void feed(__global float *in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_C, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2){ - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh/out_H; - const int out_h = out_nh%out_H; - - const int in_n = out_n; - const int in_c0 = out_c * 4 + 0; - const int in_c1 = out_c * 4 + 1; - const int in_c2 = out_c * 4 + 2; - const int in_c3 = out_c * 4 + 3; - const int in_h = out_h; - const int in_w = out_w; - - - int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - half4 output = (half4)0.0f; - output.x = convert_half(in[input_pos0]); - if(out_C - 4 * out_c>=2){ - output.y = convert_half(in[input_pos1]); - } - if(out_C - 4 * out_c>=3){ - output.z = convert_half(in[input_pos2]); - } - if(out_C - 4 * out_c>=4){ - output.w = convert_half(in[input_pos3]); - } - write_imageh(output_image, output_pos, output); - - } - -__kernel void feed_with_pre(__global uchar *in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_C, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2){ - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh/out_H; - const int out_h = out_nh%out_H; - - const int in_n = out_n; - const int in_c0 = out_c * 4 + 0; - const 
int in_c1 = out_c * 4 + 1; - const int in_c2 = out_c * 4 + 2; - const int in_c3 = out_c * 4 + 3; - const int in_h = out_h; - const int in_w = out_w; - - - int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - half4 output = (half4)0.0f; - output.x = convert_half(in[input_pos0]) / 255; - if(out_C - 4 * out_c>=2){ - output.y = convert_half(in[input_pos1]) / 255; - } - if(out_C - 4 * out_c>=3){ - output.z = convert_half(in[input_pos2]) / 255; - } - if(out_C - 4 * out_c>=4){ - output.w = convert_half(in[input_pos3]) / 255; - } - write_imageh(output_image, output_pos, output); - -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl deleted file mode 100644 index f6b8e23cc4..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void fetch(__private const int in_height, - __private const int in_width, - __read_only image2d_t input, - __global float* out, - __private const int size_ch, - __private const int size_block, - __private const int size_batch, - __private const int C) { - const int in_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = get_global_id(2); - const int in_n = in_nh / in_height; - const int in_h = in_nh % in_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const int pos_x = mad24(in_c, in_width, in_w); - half4 in = read_imageh(input, sampler, (int2)(pos_x, in_nh)); - - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; - out[index] = convert_float(in.x); - if(C - 4 * in_c>=2){ - out[index + size_ch] = convert_float(in.y); - } - if(C - 4 * in_c>=3){ - out[index + size_ch * 2] = convert_float(in.z); - } - - if(C - 4 * in_c>=4){ - out[index + size_ch * 3] = convert_float(in.w); - } - -} - -__kernel void fetch_2d(__private const int in_height, - __private const int in_width, - __read_only image2d_t input, - __global float* out) { - const int in_w = get_global_id(1); - const int in_h = get_global_id(2); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(in_w, in_h)); - - const int index = (in_h * in_width + in_w) * 4; - out[index] = convert_float(in.x); - out[index + 1] = convert_float(in.y); - out[index + 2] = convert_float(in.z); - out[index + 3] = convert_float(in.w); -} - -__kernel void fetch_with_post(__private const int in_height, - __private const int in_width, - __read_only image2d_t input, - __global uchar* out, - __private const int size_ch, - __private const int size_block, - __private const int size_batch, - __private const int C) { - const int in_c = get_global_id(0); - const int 
in_w = get_global_id(1); - const int in_nh = get_global_id(2); - const int in_n = in_nh / in_height; - const int in_h = in_nh % in_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const int pos_x = mad24(in_c, in_width, in_w); - half4 in = read_imageh(input, sampler, (int2)(pos_x, in_nh)); - - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; - out[index] = convert_uchar_sat(in.x * 255); - if(C - 4 * in_c>=2){ - out[index + size_ch] = convert_uchar_sat(in.y * 255); - } - if(C - 4 * in_c>=3){ - out[index + size_ch * 2] = convert_uchar_sat(in.z * 255); - } - - if(C - 4 * in_c>=4){ - out[index + size_ch * 3] = convert_uchar_sat(in.w * 255); - } - -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/flatten2_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/flatten2_kernel.cl deleted file mode 100644 index 337fc7ae62..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/flatten2_kernel.cl +++ /dev/null @@ -1,48 +0,0 @@ - - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - - -__kernel void flatten2(__read_only image2d_t input_img, - __write_only image2d_t output_img, - __private int out_width, - __private int in_width, - __private int in_height, - __private int in_C - ){ - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 output_pos; - output_pos.x = out_c * out_width + out_w; - output_pos.y = out_nh; - - int channel_size = in_width * in_height; - - int in_c = output_pos.x / channel_size / 4; - int2 input_pos; - input_pos.x = (output_pos.x % in_width) + (in_c * in_width); - input_pos.y = (output_pos.x % channel_size) / in_width + out_nh * in_height; - half4 input_data = read_imageh(input_img, sampler, input_pos); - - half4 output_data; - int in_c_offset = output_pos.x / channel_size % 4; - 
if(in_c_offset == 0){ - output_data.x = input_data.x; - } else if(in_c_offset == 1){ - output_data.x = input_data.y; - } else if(in_c_offset == 2){ - output_data.x = input_data.z; - } else if(in_c_offset == 3){ - output_data.x = input_data.w; - } - - write_imageh(output_img, output_pos, output_data); -} - diff --git a/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl deleted file mode 100644 index 0512ce9bea..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "cl_common.h" - -__kernel void grid_sampler(__private const int out_height, - __private const int out_width, - __read_only image2d_t input, - __read_only image2d_t grid, - __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2) * 4; - const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int x_grid = out_h / 4 * 2; - int y_grid = out_n * out_width + out_w; - float4 g1 = read_imagef(grid, sampler, (int2)(x_grid, y_grid)); - float4 g2 = read_imagef(grid, sampler, (int2)(x_grid + 1, y_grid)); - - float x = (g1.x + 1) * (out_width - 1) / 2; - float y = (g2.x + 1) * (out_height - 1) / 2; - float x0 = floor(x); - float y0 = floor(y); - int x_p = out_c * out_width + x0; - int y_p = out_n * out_height + y0; - int x_out = out_c * out_width + out_w; - int y_out = out_n * out_height + out_h; - float4 input0 = read_imagef(input, sampler, (int2)(x_p, y_p)); - float4 input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p)); - float4 input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1)); - float4 input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1)); - float4 out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) + - input1 * (x - x0) * (y0 + 1 - y) + - input2 * (x0 + 1 - x) * (y - y0) + - input3 * (x - x0) * (y - y0); - write_imageh(output, (int2)(x_out, y_out), convert_half4(out_val)); - - x = (g1.y + 1) * (out_width - 1) / 2; - y = (g2.y + 1) * (out_height - 1) / 2; - x0 = floor(x); - y0 = floor(y); - x_p = out_c * out_width + x0; - y_p = out_n * out_height + y0; - input0 = read_imagef(input, sampler, (int2)(x_p, y_p)); - input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p)); - input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1)); - input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1)); - out_val = input0 * (x0 + 1 - x) 
* (y0 + 1 - y) + - input1 * (x - x0) * (y0 + 1 - y) + - input2 * (x0 + 1 - x) * (y - y0) + - input3 * (x - x0) * (y - y0); - write_imageh(output, (int2)(x_out, y_out + 1), convert_half4(out_val)); - - x = (g1.z + 1) * (out_width - 1) / 2; - y = (g2.z + 1) * (out_height - 1) / 2; - x0 = floor(x); - y0 = floor(y); - x_p = out_c * out_width + x0; - y_p = out_n * out_height + y0; - input0 = read_imagef(input, sampler, (int2)(x_p, y_p)); - input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p)); - input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1)); - input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1)); - out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) + - input1 * (x - x0) * (y0 + 1 - y) + - input2 * (x0 + 1 - x) * (y - y0) + - input3 * (x - x0) * (y - y0); - write_imageh(output, (int2)(x_out, y_out + 2), convert_half4(out_val)); - - x = (g1.w + 1) * (out_width - 1) / 2; - y = (g2.w + 1) * (out_height - 1) / 2; - x0 = floor(x); - y0 = floor(y); - x_p = out_c * out_width + x0; - y_p = out_n * out_height + y0; - input0 = read_imagef(input, sampler, (int2)(x_p, y_p)); - input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p)); - input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1)); - input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1)); - out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) + - input1 * (x - x0) * (y0 + 1 - y) + - input2 * (x0 + 1 - x) * (y - y0) + - input3 * (x - x0) * (y - y0); - write_imageh(output, (int2)(x_out, y_out + 3), convert_half4(out_val)); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/instancenorm_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/instancenorm_kernel.cl deleted file mode 100644 index f78de05f76..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/instancenorm_kernel.cl +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "cl_common.h" - -__kernel void instancenorm(__private const int in_width, - __private const int in_height, - __private const int in_c_group, - __private const int local_work_size_x, - __private const int local_work_size_y, - __private const float epsilon, - __read_only image2d_t input, - __write_only image2d_t output) { - const int out_cn = get_global_id(0); - const int n = out_cn / in_c_group; - const int c = out_cn % in_c_group; - const int w = get_local_id(1); - const int h = get_local_id(2); - const int local_id = w * local_work_size_y + h; - const int local_total_size = local_work_size_x * local_work_size_y; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -#ifdef LOCAL_MEM_128 - __local float4 shared_mem[128]; -#elif defined(LOCAL_MEM_64) - __local float4 shared_mem[64]; -#else - __local float4 shared_mem[256]; -#endif - int xOffset = c * in_width; - int yOffset = n * in_height; - float4 sum = 0.0f; - for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { - for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { - sum += read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)); - } - } - shared_mem[local_id] = sum; - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0.0f; - if (local_id < 32) { - for (int i = local_id + 32; i < local_total_size; i += 32) { - sum += shared_mem[i]; - } - } - shared_mem[local_id] += sum; - - 
barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0.0f; - if (local_id == 0) { - int top = min(32, local_total_size); - for (int i = 0; i < top; i += 1) { - sum += shared_mem[i]; - } - shared_mem[0] = sum / (in_width * in_height); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - const float4 mean_val = shared_mem[0]; - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0.0f; - for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { - for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { - float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val; - sum += temp * temp; - } - } - shared_mem[local_id] = sum; - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0.0f; - if (local_id < 32) { - for (int i = local_id + 32; i < local_total_size; i += 32) { - sum += shared_mem[i]; - } - } - shared_mem[local_id] += sum; - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0.0f; - if (local_id == 0) { - int top = min(32, local_total_size); - for (int i = 0; i < top; i += 1) { - sum += shared_mem[i]; - } - shared_mem[0] = sum / (in_width * in_height); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon)); - - float4 s = 1 / sigma; - - for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { - for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { - int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex); - float4 in_val = read_imagef(input, sampler, intout_pos); - half4 out_val = convert_half4((in_val - mean_val) * s); -#ifdef RELU - out_val = activation(out_val); -#endif - write_imageh(output, intout_pos, out_val); - } - } -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl deleted file mode 100644 index d8c0129928..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void leakyrelu(__read_only image2d_t input, - __write_only image2d_t output, __private const float alpha, __private const int dims_w) { - const int c = get_global_id(0); - const int w = get_global_id(1); - const int nh = get_global_id(2); - int2 input_pos; - input_pos.x = c * dims_w + w; - input_pos.y = nh; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); - - half4 output_data; - output_data.x = max((float)(in.x), (float)(alpha * (in.x))); - output_data.y = max((float)(in.y), (float)(alpha * (in.y))); - output_data.z = max((float)(in.z), (float)(alpha * (in.z))); - output_data.w = max((float)(in.w), (float)(alpha * (in.w))); - - write_imageh(output, (int2)(input_pos.x, input_pos.y), output_data); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl deleted file mode 100644 index 080928b235..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void lrn(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_W, - __private const int n, - __private const float k, - __private const float alpha, - __private const float beta){ - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const int out_c0 = out_c * 4; - const int out_c1 = out_c * 4 + 1; - const int out_c2 = out_c * 4+ 2; - const int out_c3 = out_c * 4+ 3; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - const int start = -(n-1)/2; - const end = start + n; - float sqr_sum0 = 0.0f; - float sqr_sum1 = 0.0f; - float sqr_sum2 = 0.0f; - float sqr_sum3 = 0.0f; - int input_c0,input_c1,input_c2,input_c3; - int2 input_pos0,input_pos1,input_pos2,input_pos3; - float4 input0,input1,input2,input3; - for(int i = start; i < end ;i++){ - if(out_c0 + i>=0&&out_c0 + i=0&&out_c1 + i=0&&out_c2 + i=0&&out_c3 + i=2){ - output.y = input.y / (pow(k + alpha * (sqr_sum1),beta)); - } - if(out_C - 4 * out_c>=3){ - output.z = input.z / (pow(k + alpha * (sqr_sum2),beta)); - } - if(out_C - 4 * out_c>=4){ - output.w = input.w / (pow(k + alpha * (sqr_sum3),beta)); - } - half4 tmp = convert_half4(output); - write_imageh(output_image, output_pos, tmp); - -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl deleted 
file mode 100644 index b74449d9c8..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output, - __private const float scale_h, __private const float scale_w, - __private const int in_dims_h, __private const int out_dims_h, - __private const int in_dims_w, __private const int out_dims_w) { - const int c = get_global_id(0); - const int w = get_global_id(1); - const int nh = get_global_id(2); - int2 output_pos; - output_pos.x = c * out_dims_w + w; - output_pos.y = nh; - int out_n = nh / out_dims_h; - int out_h = nh % out_dims_h; - int2 input_pos; - input_pos.x = c * in_dims_w + w / scale_w; - input_pos.y = out_n * in_dims_h + out_h / scale_h; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); - write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/pad2d_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/pad2d_kernel.cl deleted file mode 100644 index 6d9142a16d..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/pad2d_kernel.cl +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright 
(c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void pad2d( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_bottom, - __private const int pad_left, __private const int pad_right, - __private const int mode, __private const float pad_value, - __read_only image2d_t input, __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - - int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int x = out_w - pad_left; - int y = out_h - pad_top; - - if (mode == 0) { - if (x < 0 || y < 0 || x >= in_width || y >= in_height) { - write_imageh(output, output_pos, (half4)(pad_value)); - } else { - write_imageh(output, output_pos, read_imageh(input, sampler, (int2)(out_c * in_width + x, out_n * in_height + y))); - } - } else if (mode == 1) { - x = abs(x); - y = abs(y); - x = x < in_width ? x : 2 * in_width - 2 - x; - y = y < in_height ? 
y : 2 * in_height - 2 - y; - write_imageh(output, output_pos, read_imageh(input, sampler, (int2)(out_c * in_width + x, out_n * in_height + y))); - } else if (mode == 2) { - x = x > 0 ? x : 0; - x = x < in_width ? x : in_width - 1; - y = y > 0 ? y : 0; - y = y < in_height ? y : in_height - 1; - write_imageh(output, output_pos, read_imageh(input, sampler, (int2)(out_c * in_width + x, out_n * in_height + y))); - } -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/pixel_shuffle_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/pixel_shuffle_kernel.cl deleted file mode 100644 index a38c1ceae0..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/pixel_shuffle_kernel.cl +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void pixel_shuffle(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int in_N, - __private const int in_C, - __private const int in_H, - __private const int in_W, - __private const int out_N, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int upscale_factor) { - - const int out_c4 = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int out_h = out_nh % out_H; - int out_n = out_nh / out_H; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int in_h = out_h / upscale_factor; - int in_w = out_w / upscale_factor; - int in_nh = out_n * in_H + in_h; - - half4 res; - int out_c; - int in_c; - half4 in; - int2 in_pos; - - out_c = out_c4 * 4 + 0; - in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor); - in_pos.x = (in_c / 4) * in_W + in_w; - in_pos.y = in_nh; - in = read_imageh(input_image, sampler, in_pos); - if (in_c % 4 == 0) { - res.x = in.x; - } else if (in_c % 4 == 1) { - res.x = in.y; - } else if (in_c % 4 == 2) { - res.x = in.z; - } else if (in_c % 4 == 3) { - res.x = in.w; - } - - out_c = out_c4 * 4 + 1; - in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor); - in_pos.x = (in_c / 4) * in_W + in_w; - in_pos.y = in_nh; - in = read_imageh(input_image, sampler, in_pos); - if (in_c % 4 == 0) { - res.y = in.x; - } else if (in_c % 4 == 1) { - res.y = in.y; - } else if (in_c % 4 == 2) { - res.y = in.z; - } else if (in_c % 4 == 3) { - res.y = in.w; - } - - out_c = out_c4 * 4 + 2; - in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor); - in_pos.x = (in_c / 4) * in_W + in_w; - in_pos.y = in_nh; - in = 
read_imageh(input_image, sampler, in_pos); - if (in_c % 4 == 0) { - res.z = in.x; - } else if (in_c % 4 == 1) { - res.z = in.y; - } else if (in_c % 4 == 2) { - res.z = in.z; - } else if (in_c % 4 == 3) { - res.z = in.w; - } - - out_c = out_c4 * 4 + 3; - in_c = out_c * upscale_factor * upscale_factor + (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor); - in_pos.x = (in_c / 4) * in_W + in_w; - in_pos.y = in_nh; - in = read_imageh(input_image, sampler, in_pos); - if (in_c % 4 == 0) { - res.w = in.x; - } else if (in_c % 4 == 1) { - res.w = in.y; - } else if (in_c % 4 == 2) { - res.w = in.z; - } else if (in_c % 4 == 3) { - res.w = in.w; - } - - int2 out_pos; - out_pos.x = out_c4 * out_W + out_w; - out_pos.y = out_nh; - write_imageh(output_image, out_pos, res); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/pool_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/pool_kernel.cl deleted file mode 100644 index fd4cc07799..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/pool_kernel.cl +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define MIN_VALUE -FLT_MAX - -__kernel void pool_max( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int start_h = out_h * stride_h - pad_top; - int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h,0); - - int start_w = out_w * stride_w - pad_left; - int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w,0); - - const int pos_in_x = out_c * in_width; - const int pos_in_y = out_n * in_height; - half4 max_value = (half4)(MIN_VALUE); - for (int y = start_h; y < end_h; ++y) { - for (int x = start_w; x < end_w; ++x) { - half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - max_value = max(max_value, tmp); - } - } - - const int pos_out_x = mad24(out_c, out_width, out_w); - write_imageh(output, (int2)(pos_out_x, out_nh), max_value); -} - -__kernel void pool_avg( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - 
const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int start_h = out_h * stride_h - pad_top; - int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h, 0); - - int start_w = out_w * stride_w - pad_left; - int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w, 0); - - const int pos_in_x = out_c * in_width; - const int pos_in_y = out_n * in_height; - half4 sum = (half4)(0.0f); - int num = 0 ; - for (int y = start_h; y < end_h; ++y) { - for (int x = start_w; x < end_w; ++x) { - sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - } - } - - num = ksize_w * ksize_h; - half4 avg = sum / num; - - const int pos_out_x = mad24(out_c, out_width, out_w); - write_imageh(output, (int2)(pos_out_x, out_nh), avg); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/pre_post_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/pre_post_kernel.cl deleted file mode 100644 index edb6138919..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/pre_post_kernel.cl +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void pre(__global const uchar *input, - __global float *output){ - - int index = get_global_id(0); - output[index] = convert_float(input[index]) / 255; - - } diff --git a/mobile/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl deleted file mode 100644 index 886f62df68..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void prior_box(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __global float *box_width, - __global float *box_height, - __global float *variances_Buffer, - __write_only image2d_t output_boxes, - __write_only image2d_t output_variances, - __private const float step_width, - __private const float step_height, - __private const float offset, - __private const int img_width, - __private const int img_height, - __private const int num_priors, - __private const int C, - __private const int clip){ - - const int out_c = get_global_id(0); - const int out_nh = get_global_id(1); - const int out_n = out_nh/num_priors; - const int out_h = out_nh%num_priors; - - int2 output_pos; - output_pos.x = out_c * 4; - output_pos.y = out_nh; - float center_x0 = (offset + (float)(out_c * 4)) * step_width; - float center_x1 = (offset + (float)(out_c * 4 + 1)) * step_width; - float center_x2 = (offset + (float)(out_c * 4 + 2)) * step_width; - float center_x3 = (offset + (float)(out_c * 4 + 3)) * step_width; - float center_y = ((float)out_n + offset) * step_height; - - half4 output[4]; - half4 variances[4]; - output[0].x = convert_half((center_x0 - box_width[out_h]) / (float)img_width); - output[1].x = convert_half((center_y - box_height[out_h]) / (float)img_height); - output[2].x = convert_half((center_x0 + box_width[out_h]) / (float)img_width); - output[3].x = convert_half((center_y + box_height[out_h]) / (float)img_height); - variances[0].x = convert_half(variances_Buffer[0]); - variances[1].x = convert_half(variances_Buffer[1]); - variances[2].x = convert_half(variances_Buffer[2]); - variances[3].x = convert_half(variances_Buffer[3]); - - if(C - 4 * out_c>=2){ - output[0].y = convert_half((center_x1 - box_width[out_h]) / (float)img_width); - output[1].y = convert_half((center_y - box_height[out_h]) / (float)img_height); - output[2].y = 
convert_half((center_x1 + box_width[out_h]) / (float)img_width); - output[3].y = convert_half((center_y + box_height[out_h]) / (float)img_height); - variances[0].y = convert_half(variances_Buffer[0]); - variances[1].y = convert_half(variances_Buffer[1]); - variances[2].y = convert_half(variances_Buffer[2]); - variances[3].y = convert_half(variances_Buffer[3]); - }else{ - output[0].y = 0.0f; - output[1].y = 0.0f; - output[2].y = 0.0f; - output[3].y = 0.0f; - } - if(C - 4 * out_c>=3){ - output[0].z = convert_half((center_x2 - box_width[out_h]) / (float)img_width); - output[1].z = convert_half((center_y - box_height[out_h]) / (float)img_height); - output[2].z = convert_half((center_x2 + box_width[out_h]) / (float)img_width); - output[3].z = convert_half((center_y + box_height[out_h]) / (float)img_height); - variances[0].z = convert_half(variances_Buffer[0]); - variances[1].z = convert_half(variances_Buffer[1]); - variances[2].z = convert_half(variances_Buffer[2]); - variances[3].z = convert_half(variances_Buffer[3]); - }else{ - output[0].z = 0.0f; - output[1].z = 0.0f; - output[2].z = 0.0f; - output[3].z = 0.0f; - } - if(C - 4 * out_c>=4){ - output[0].w = convert_half((center_x3 - box_width[out_h]) / (float)img_width); - output[1].w = convert_half((center_y - box_height[out_h]) / (float)img_height); - output[2].w = convert_half((center_x3 + box_width[out_h]) / (float)img_width); - output[3].w = convert_half((center_y + box_height[out_h]) / (float)img_height); - variances[0].w = convert_half(variances_Buffer[0]); - variances[1].w = convert_half(variances_Buffer[1]); - variances[2].w = convert_half(variances_Buffer[2]); - variances[3].w = convert_half(variances_Buffer[3]); - }else{ - output[0].w = 0.0f; - output[1].w = 0.0f; - output[2].w = 0.0f; - output[3].w = 0.0f; - } - if(clip==1){ - output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 
1.0f, 1.0f)); - output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - } - /* - if(output_pos.x == 0 && output_pos.y == 1){ - float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x); - printf("output = %v4hlf \n", out); - - } - */ - - write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]); - write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]); - write_imageh(output_boxes, (int2)(output_pos.x + 2, output_pos.y), output[2]); - write_imageh(output_boxes, (int2)(output_pos.x + 3, output_pos.y), output[3]); - - write_imageh(output_variances, (int2)(output_pos.x + 0, output_pos.y), variances[0]); - write_imageh(output_variances, (int2)(output_pos.x + 1, output_pos.y), variances[1]); - write_imageh(output_variances, (int2)(output_pos.x + 2, output_pos.y), variances[2]); - write_imageh(output_variances, (int2)(output_pos.x + 3, output_pos.y), variances[3]); - - -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/relu.cl b/mobile/src/operators/kernel/cl/cl_kernel/relu.cl deleted file mode 100644 index cc8f9c3742..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/relu.cl +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void relu(__read_only image2d_t input, - __write_only image2d_t output){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(x, y)); - in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in); - write_imageh(output, (int2)(x, y), in); -} - -__kernel void relu_p0(__read_only image2d_t input, - __write_only image2d_t output){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(x, y)); - in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in); - write_imageh(output, (int2)(x, y), in); -} -__kernel void relu_p1(__read_only image2d_t input, - __write_only image2d_t output){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(x, y)); - write_imageh(output, (int2)(x, y), in); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/relu6.cl b/mobile/src/operators/kernel/cl/cl_kernel/relu6.cl deleted file mode 100644 index 7a2f0e022f..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/relu6.cl +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void relu6(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(x, y)); - in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in); - in = min((half4)(threshold, threshold, threshold, threshold), in); - write_imageh(output, (int2)(x, y), in); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/reshape.cl b/mobile/src/operators/kernel/cl/cl_kernel/reshape.cl deleted file mode 100644 index 7957001c96..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/reshape.cl +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void reshape(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H, - __private const int in_Stride0, - __private const int in_Stride1, - __private const int in_Stride2, - __private const int out_Stride0, - __private const int out_Stride1, - __private const int out_Stride2) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh/out_H; - const int out_h = out_nh%out_H; - const int out_c0 = out_c * 4; - const int out_c1 = out_c * 4 + 1; - const int out_c2 = out_c * 4+ 2; - const int out_c3 = out_c * 4+ 3; - - int count0 = out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w; - int count1 = out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w; - int count2 = out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w; - int count3 = out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w; - - int in_n0 = count0/in_Stride2; - int in_n1 = count1/in_Stride2; - int in_n2 = count1/in_Stride2; - int in_n3 = count2/in_Stride2; - - count0 = count0%in_Stride2; - count1 = count1%in_Stride2; - count2 = count2%in_Stride2; - count3 = count3%in_Stride2; - - int in_c0 = count0/in_Stride1; - int in_c1 = count1/in_Stride1; - int in_c2 = count2/in_Stride1; - int in_c3 = count3/in_Stride1; - - int in_h0 = (count0%in_Stride1)/in_Stride0; - int in_h1 = (count1%in_Stride1)/in_Stride0; - int in_h2 = (count2%in_Stride1)/in_Stride0; - int in_h3 = (count3%in_Stride1)/in_Stride0; - - int in_w0 = (count0%in_Stride1)%in_Stride0; - int in_w1 = (count1%in_Stride1)%in_Stride0; - int in_w2 = (count2%in_Stride1)%in_Stride0; - int in_w3 = (count3%in_Stride1)%in_Stride0; - - - int2 input_pos0; - int2 input_pos1; - int2 
input_pos2; - int2 input_pos3; - - input_pos0.x = (in_c0/4) * in_W + in_w0; - input_pos0.y = in_n0 * in_H + in_h0; - - input_pos1.x = (in_c1/4) * in_W + in_w1; - input_pos1.y = in_n1 * in_H + in_h1; - - input_pos2.x = (in_c2/4) * in_W + in_w2; - input_pos2.y = in_n2 * in_H + in_h2; - - input_pos3.x = (in_c3/4) * in_W + in_w3; - input_pos3.y = in_n3 * in_H + in_h3; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 input0; - half4 input1; - half4 input2; - half4 input3; - half4 output; - - input0 = read_imageh(input_image, sampler,input_pos0); - if(in_c0%4==0){ - output.x = input0.x; - }else if(in_c0%4==1){ - output.x = input0.y; - }else if(in_c0%4==2){ - output.x = input0.z; - }else{ - output.x = input0.w; - } - if(out_C - out_c * 4>=2){ - input1 = read_imageh(input_image, sampler,input_pos1); - if(in_c1%4==0){ - output.y = input1.x; - }else if(in_c1%4==1){ - output.y = input1.y; - }else if(in_c1%4==2){ - output.y = input1.z; - }else{ - output.y = input1.w; - } - - }else{ - output.y = 0.0f; - } - - if(out_C - out_c * 4>=3){ - input2 = read_imageh(input_image, sampler,input_pos2); - - if(in_c2%4==0){ - output.z = input2.x; - }else if(in_c2%4==1){ - output.z = input1.y; - }else if(in_c2%4==2){ - output.z = input2.z; - }else{ - output.z = input2.w; - } - }else{ - output.z = 0.0f; - } - - if(out_C - out_c * 4>=4){ - input3 = read_imageh(input_image, sampler,input_pos3); - if(in_c3%4==0){ - output.w = input3.x; - }else if(in_c3%4==1){ - output.w = input3.y; - }else if(in_c3%4==2){ - output.w = input3.z; - }else{ - output.w = input3.w; - } - }else{ - output.w = 0.0f; - } - - write_imageh(output_image, output_pos, output); -} - - -/* - -__kernel void reshape(__read_only image2d_t input, - __write_only image2d_t output, - __private const int d0, - __private const int d1, - __private const int d2, - __private const int d3, - 
__private const int x0, - __private const int x1, - __private const int x2, - __private const int x3) { - const int x = get_global_id(0); - const int y = get_global_id(1); - int obx = x / x3; - int oby = y / x2; - int ox = x % x3; - int oy = y % x2; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 r; - for (int i = 0; i < 4; i++) { - int t = obx * 4 + i; - if (t > x1) break; - int oindex = oby * x1 * x2 * x3 + t * x2 * x3 + ox * x3 + oy; - int i3 = oindex % d3; oindex /= d3; - int i2 = oindex % d2; oindex /= d2; - int i1 = oindex % d1; oindex /= d1; - int i0 = oindex; - int ix = (i1 / 4) * d3 + i3; - int iy = i0 * d2 + i2; - half4 p = read_imageh(input, sampler, (int2)(ix, iy)); - ((half*)&r)[i] = ((half*)&p)[i1%4]; - } - write_imageh(output, (int2)(x, y), r); -} - -*/ diff --git a/mobile/src/operators/kernel/cl/cl_kernel/scale_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/scale_kernel.cl deleted file mode 100644 index 57d775b22b..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/scale_kernel.cl +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void scale(__read_only image2d_t input, - __write_only image2d_t output, - __private float scale, - __private float bias, - __private int out_width){ - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int pos_x = mad24(out_c, out_width, out_w); - half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh)); - in = convert_half(scale) * in + convert_half(bias); - write_imageh(output, (int2)(pos_x, out_nh), in); -} diff --git a/mobile/src/operators/kernel/cl/cl_kernel/sigmoid.cl b/mobile/src/operators/kernel/cl/cl_kernel/sigmoid.cl deleted file mode 100644 index 0a1995d42c..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/sigmoid.cl +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void sigmoid(__read_only image2d_t input, - __write_only image2d_t output){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 in = read_imageh(input, sampler, (int2)(x, y)); - half4 out; - out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x))); - out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); - out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); - out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); - write_imageh(output, (int2)(x, y), out); -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/slice_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/slice_kernel.cl deleted file mode 100644 index aab8357d82..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/slice_kernel.cl +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void slice(__read_only image2d_t input, __write_only image2d_t output, - __private const int start, __private const int end, - __private const int dims_w){ - - const int c = get_global_id(0); - const int w = get_global_id(1); - const int nh = get_global_id(2); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 output_pos; - output_pos.x = c * dims_w + w; - output_pos.y = nh; - - int2 input_pos; - half4 input_data; - half4 output_data; - - if (start % 4 == 0) { - input_pos.x = (4 * c + start) / 4 * dims_w + w; - input_pos.y = nh; - input_data = read_imageh(input, sampler,input_pos); - output_data = input_data; - } else if (start % 4 == 1) { - input_pos.x = (4 * c + start) / 4 * dims_w + w; - input_pos.y = nh; - input_data = read_imageh(input, sampler,input_pos); - output_data.x = input_data.y; - output_data.y = input_data.z; - output_data.z = input_data.w; - input_pos.x = input_pos.x + dims_w; - input_pos.y = nh; - input_data = read_imageh(input, sampler,input_pos); - output_data.w = input_data.x; - } else if (start % 4 == 2) { - input_pos.x = (4 * c + start) / 4 * dims_w + w; - input_pos.y = nh; - input_data = read_imageh(input, sampler,input_pos); - output_data.x = input_data.z; - output_data.y = input_data.w; - input_pos.x = input_pos.x + dims_w; - input_pos.y = nh; - input_data = read_imageh(input, sampler,input_pos); - output_data.z = input_data.x; - output_data.w = input_data.y; - } else if (start % 4 == 3) { - input_pos.x = (4 * c + start) / 4 * dims_w + w; - input_pos.y = nh; - input_data = read_imageh(input, sampler,input_pos); - output_data.x = input_data.w; - input_pos.x = input_pos.x + dims_w; - input_pos.y = nh; - input_data = read_imageh(input, sampler,input_pos); - output_data.y = input_data.x; - output_data.z = input_data.y; - output_data.w = input_data.z; - } - write_imageh(output, output_pos, output_data); - -} diff --git 
a/mobile/src/operators/kernel/cl/cl_kernel/softmax.cl b/mobile/src/operators/kernel/cl/cl_kernel/softmax.cl deleted file mode 100644 index a1fa014e00..0000000000 --- a/mobile/src/operators/kernel/cl/cl_kernel/softmax.cl +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void softmax(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_W - ) { - const int out_c = get_global_id(0); // block index - const int out_w = get_global_id(1); // index in one block - const int out_nh = get_global_id(2); - - const int in_c = out_c; - const int in_w = out_w; - const int in_nh = out_nh; - - int2 input_pos; - int2 output_pos; - - input_pos.x = in_c * out_W + in_w; - input_pos.y = in_nh; - - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 input_max = 0.0f; - half4 input_tmp; - for(int i=0;i=2){ - input1 = read_imageh(input_image, sampler,input_pos1); - if(out_w%4==0){ - output.y = input1.x; - }else if(out_w%4==1){ - output.y = input1.y; - }else if(out_w%4==2){ - output.y = input1.z; - }else{ - output.y = input1.w; - } - - }else{ - output.y = 0.0f; - } - - if(out_C - out_c * 4>=3){ - input2 = read_imageh(input_image, sampler,input_pos2); - - if(out_w%4==0){ - output.z = input2.x; 
- }else if(out_w%4==1){ - output.z = input2.y; - }else if(out_w%4==2){ - output.z = input2.z; - }else{ - output.z = input2.w; - } - }else{ - output.z = 0.0f; - } - - if(out_C - out_c * 4>=4){ - input3 = read_imageh(input_image, sampler,input_pos3); - if(out_w%4==0){ - output.w = input3.x; - }else if(out_w%4==1){ - output.w = input3.y; - }else if(out_w%4==2){ - output.w = input3.z; - }else{ - output.w = input3.w; - } - }else{ - output.w = 0.0f; - } - write_imageh(output_image, output_pos, output); -} - -__kernel void transpose( __read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W - ){ - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = 1; - const int out_h = out_nh%out_H; - - const int in_n = 1; - const int in_c = out_c; - const int in_w = out_h; - const int in_h = out_w; - - int2 input_pos; - int2 output_pos; - - input_pos.x = in_c * in_W + in_w; - input_pos.y = in_n * in_h; - - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_n * out_h; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - half4 input; - half4 output; - input = read_imageh(input_image, sampler,input_pos); - - output = input; - write_imageh(output_image, output_pos, output); - -} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/concat_kernel.cpp b/mobile/src/operators/kernel/cl/concat_kernel.cpp deleted file mode 100644 index 013faa3fd1..0000000000 --- a/mobile/src/operators/kernel/cl/concat_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONCAT_OP - -#include "operators/kernel/concat_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConcatKernel::Init(ConcatParam *param) { - if (param->Out()->dims().size() < 4) { - if (param->Out()->dims().size() - param->axis_ == 1) { - this->cl_helper_.AddKernel("concatByW", "concat_kernel.cl"); - } else { - this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl"); - } - } else if (param->Out()->dims().size() >= 4) { - if (param->Inputs().size() == 2) { - this->cl_helper_.AddKernel("concatByCWith2Inputs", "concat_kernel.cl"); - } else if (param->Inputs().size() == 3) { - this->cl_helper_.AddKernel("concatByCWith3Inputs", "concat_kernel.cl"); - } else if (param->Inputs().size() == 4) { - this->cl_helper_.AddKernel("concatByCWith4Inputs", "concat_kernel.cl"); - } else { - return false; - } - } - return true; -} - -template <> -void ConcatKernel::Compute(const ConcatParam ¶m) { - if (param.Out()->dims().size() < 4) { - auto kernel = this->cl_helper_.KernelAt(0); - auto inputs = param.Inputs(); - auto *output_image = param.Out()->GetCLImage(); - int out_W = 0; - if (param.Out()->dims().size() == 3) { - out_W = param.Out()->dims()[2]; - } else if (param.Out()->dims().size() == 2) { - out_W = param.Out()->dims()[1]; - } - int out_H_Start = 0; - if (param.Out()->dims().size() - param.axis_ == 1) { - for (int i = 0; i < inputs.size(); i++) { - int pre_Width = 0; - for (int k = 0; k < i; ++k) { - pre_Width += inputs[k]->dims()[inputs[k]->dims().size() - 1]; - } - int in_w = inputs[i]->dims()[param.Out()->dims().size() - 2]; - auto 
input_image = inputs[i]->GetCLImage(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &in_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &pre_Width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } - - } else { - for (int i = 0; i < inputs.size(); i++) { - auto input_image = inputs[i]->GetCLImage(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &out_H_Start); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - if (param.Out()->dims().size() == 3) { - out_H_Start += inputs[i]->dims()[1]; - } else if (param.Out()->dims().size() == 2) { - out_H_Start += inputs[i]->dims()[0]; - } - } - } - - } else { - auto kernel0 = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - auto inputs = param.Inputs(); - int arg_offset; - cl_int status; - if (inputs.size() == 2) { - auto input_image_0 = inputs[0]->GetCLImage(); - status = 
clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0); - CL_CHECK_ERRORS(status); - auto input_image_1 = inputs[1]->GetCLImage(); - status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1); - CL_CHECK_ERRORS(status); - int C_0 = inputs[0]->dims()[1]; - status = clSetKernelArg(kernel0, 2, sizeof(int), &C_0); - CL_CHECK_ERRORS(status); - int C_1 = inputs[1]->dims()[1]; - status = clSetKernelArg(kernel0, 3, sizeof(int), &C_1); - CL_CHECK_ERRORS(status); - arg_offset = 4; - } else if (inputs.size() == 3) { - auto input_image_0 = inputs[0]->GetCLImage(); - status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0); - CL_CHECK_ERRORS(status); - auto input_image_1 = inputs[1]->GetCLImage(); - status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1); - CL_CHECK_ERRORS(status); - auto input_image_2 = inputs[2]->GetCLImage(); - status = clSetKernelArg(kernel0, 2, sizeof(cl_mem), &input_image_2); - CL_CHECK_ERRORS(status); - int C_0 = inputs[0]->dims()[1]; - status = clSetKernelArg(kernel0, 3, sizeof(int), &C_0); - CL_CHECK_ERRORS(status); - int C_1 = inputs[1]->dims()[1]; - status = clSetKernelArg(kernel0, 4, sizeof(int), &C_1); - CL_CHECK_ERRORS(status); - int C_2 = inputs[2]->dims()[1]; - status = clSetKernelArg(kernel0, 5, sizeof(int), &C_2); - CL_CHECK_ERRORS(status); - arg_offset = 6; - } else if (inputs.size() == 4) { - auto input_image_0 = inputs[0]->GetCLImage(); - status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0); - CL_CHECK_ERRORS(status); - auto input_image_1 = inputs[1]->GetCLImage(); - status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1); - CL_CHECK_ERRORS(status); - auto input_image_2 = inputs[2]->GetCLImage(); - status = clSetKernelArg(kernel0, 2, sizeof(cl_mem), &input_image_2); - CL_CHECK_ERRORS(status); - auto input_image_3 = inputs[3]->GetCLImage(); - status = clSetKernelArg(kernel0, 3, sizeof(cl_mem), &input_image_3); - CL_CHECK_ERRORS(status); - int C_0 = inputs[0]->dims()[1]; - status 
= clSetKernelArg(kernel0, 4, sizeof(int), &C_0); - CL_CHECK_ERRORS(status); - int C_1 = inputs[1]->dims()[1]; - status = clSetKernelArg(kernel0, 5, sizeof(int), &C_1); - CL_CHECK_ERRORS(status); - int C_2 = inputs[2]->dims()[1]; - status = clSetKernelArg(kernel0, 6, sizeof(int), &C_2); - CL_CHECK_ERRORS(status); - int C_3 = inputs[3]->dims()[1]; - status = clSetKernelArg(kernel0, 7, sizeof(int), &C_3); - CL_CHECK_ERRORS(status); - arg_offset = 8; - } - auto *output_image = param.Out()->GetCLImage(); - status = - clSetKernelArg(kernel0, arg_offset + 0, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - int out_C = param.Out()->dims()[1]; - status = clSetKernelArg(kernel0, arg_offset + 1, sizeof(int), &out_C); - CL_CHECK_ERRORS(status); - int out_W = param.Out()->dims()[3]; - status = clSetKernelArg(kernel0, arg_offset + 2, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp deleted file mode 100644 index 758f60b4fb..0000000000 --- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ /dev/null @@ -1,271 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBNRELU_OP - -#include "operators/kernel/conv_add_bn_relu_kernel.h" - -#include - -#include "framework/cl/cl_image.h" -#include "framework/cl/cl_tool.h" -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { -template <> -bool ConvAddBNReluKernel::Init( - FusionConvAddBNReluParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - if (!param->Bias()->isInit()) { - param->Bias()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - } - - // const CL *mean = param->InputMean(); - const framework::CLImage *mean = param->InputMean(); - const framework::CLImage *variance = param->InputVariance(); - const framework::CLImage *scale = param->InputScale(); - const framework::CLImage *bias = param->InputBias(); - const float epsilon = param->Epsilon(); - - const int C = mean->numel(); - - // for (int j = 0; j < C; ++j) { - // DLOG << " mean - " << j << mean->data()[j]; - // } - // - // for (int j = 0; j < C; ++j) { - // DLOG << " variance - " << j << variance->data()[j]; - // } - // - // for (int j = 0; j < C; ++j) { - // DLOG << " scale - " << j << scale->data()[j]; - // } - // - // for (int j = 0; j < C; ++j) { - // DLOG << " bias - " << j << bias->data()[j]; - // } - - // - // DLOG << " climage mean: " << *mean; - // DLOG << " climage variance: " << *variance; - // DLOG << " climage scale: " << *scale; - // DLOG << " climage bias: " << *bias; - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - float *new_scale_ptr = new float[C]; - float 
*new_bias_ptr = new float[C]; - - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - } - - framework::CLImage *new_scale = new framework::CLImage(); - - // for (int j = 0; j < C; ++j) { - // DLOG << " new scale - " << j << new_scale_ptr[j]; - // } - // - // for (int j = 0; j < C; ++j) { - // DLOG << " new bias - " << j << new_bias_ptr[j]; - // } - - new_scale->SetTensorData(new_scale_ptr, variance->dims()); - new_scale->InitCLImage(this->cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - // DLOG << " climage - y bias: " << *(param->Bias()); - // - // DLOG << " climage - new scale: " << *new_scale; - - framework::CLImage *new_bias = new framework::CLImage(); - - new_bias->SetTensorData(new_bias_ptr, variance->dims()); - new_bias->InitCLImage(this->cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - // DLOG << " climage - new bias: " << *new_bias; - // - // DLOG << " climage - filter: " << *(param->Filter()); - - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - delete[](new_scale_ptr); - delete[](new_bias_ptr); - - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - - param->SetOffset(offset); - - const std::string conv_kernel_file = "conv_kernel.cl"; - const std::string wino_kernel_file = "winograd_transform.cl"; - std::string build_options = "-DBATCH_NORM -DRELU"; - if (param->Output()->dims() == param->Bias()->dims()) { - build_options += " -DBIASE_ELE"; - } else { - build_options += " -DBIASE_CH"; - } - - /* - if (param->Filter()->dims()[2] == 1 && - param->Filter()->dims()[3] == 1 && - (param->Filter()->dims()[0] % 16) == 0) { - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - 
this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl"); - DLOG << " conv add bn relu conv 1x1 4"; - } - */ - if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - if (param->Input()->dims()[1] % 4 == 0) { - this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, - build_options); - } else { - this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, - build_options); - } - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file, - build_options); - } else { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file, - build_options); - } - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] != 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - // other depthwise not with filter 3x3 - DLOG << "depth_conv basic "; - param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; - this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); - - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3) { - // if (param->Strides()[0] == param->Strides()[1] && - // param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) { - // param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - // this->cl_helper_.AddKernel("winograd_filter_transform_2x2", - // wino_kernel_file, build_options); - // 
this->cl_helper_.AddKernel("winograd_input_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("matmul", "matmul.cl"); - // this->cl_helper_.AddKernel("winograd_output_transform_2x2", - // wino_kernel_file, build_options); - // - // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); - // - // } else { - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - // std::cout << " input dim " << param->Input()->dims()[0] << " " - // << param->Input()->dims()[1] << " " << - // param->Input()->dims()[2] - // << " " << param->Input()->dims()[3] << " " << std::endl; - // std::cout << " output dim " << param->Output()->dims()[0] << " " - // << param->Output()->dims()[1] << " " << - // param->Output()->dims()[2] - // << " " << param->Output()->dims()[3] << " " << std::endl; - // std::cout << " filter dim " << param->Filter()->dims()[0] << " " - // << param->Filter()->dims()[1] << " " << - // param->Filter()->dims()[2] - // << " " << param->Filter()->dims()[3] << " " << std::endl; - - if (param->groups > 1) { - param->ExecMode() = - ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT; - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); - } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, - build_options); - } - // } - } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); - } - - return true; -} - -template <> -void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias(), - param.NewScale(), param.NewBias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: - 
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(), - param.NewScale(), param.NewBias()); - break; - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - DWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(), - param.NewScale(), param.NewBias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(), - param.NewScale(), param.NewBias()); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} - -template class ConvAddBNReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp deleted file mode 100644 index 5f21d3dd3e..0000000000 --- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP - -#include "operators/kernel/conv_add_kernel.h" -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddKernel::Init(FusionConvAddParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - if (!param->Bias()->isInit()) { - param->Bias()->InitCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - } - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - param->SetOffset(offset); - - const std::string conv_kernel_file = "conv_kernel.cl"; - const std::string wino_kernel_file = "winograd_transform.cl"; - std::string build_options; - if (param->Output()->dims() == param->Bias()->dims()) { - build_options = "-DBIASE_ELE"; - } else { - build_options = "-DBIASE_CH"; - } - - if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->Input()->dims()[1] % 4 == 0) { - this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, - build_options); - } else { - this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, - build_options); - } - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file, - build_options); - } else { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file, - 
build_options); - } - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] != 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; - this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); - - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3) { - // if (param->Strides()[0] == param->Strides()[1] && - // param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) { - // param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - // this->cl_helper_.AddKernel("winograd_filter_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("winograd_input_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("matmul", "matmul.cl"); - // this->cl_helper_.AddKernel("winograd_output_transform_2x2", - // wino_kernel_file, build_options); - // - // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); - // - // } else { - - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - if (param->groups > 1) { - param->ExecMode() = - ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT; - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); - } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, - build_options); - } - // } - - } else if (param->Filter()->dims()[2] == 7 && - param->Filter()->dims()[3] == 7) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_7x7spl", conv_kernel_file, build_options); - - } else if (param->Filter()->dims()[2] == 5 && - param->Filter()->dims()[3] == 5) { - param->ExecMode() = 
ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_5x5", conv_kernel_file, build_options); - } - - return true; -} - -template <> -void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: - ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); - break; - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} - -template class ConvAddKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp deleted file mode 100644 index 16281e5cb7..0000000000 --- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDRELU_OP - -#include "operators/kernel/conv_add_relu_kernel.h" -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddReluKernel::Init( - FusionConvAddReluParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - if (!param->Bias()->isInit()) { - param->Bias()->InitCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - } - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - param->SetOffset(offset); - - const std::string conv_kernel_file = "conv_kernel.cl"; - const std::string wino_kernel_file = "winograd_transform.cl"; - std::string build_options = "-DRELU"; - if (param->Output()->dims() == param->Bias()->dims()) { - build_options += " -DBIASE_ELE"; - } else { - build_options += " -DBIASE_CH"; - } - - if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - if (param->Input()->dims()[1] % 4 == 0) { - this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, - build_options); - } else { - this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, - build_options); - } - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == 3) { - 
param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file, - build_options); - } else { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file, - build_options); - } - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] != 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - DLOG << "init depwise conv basic"; - param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; - this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3) { - // if (param->Strides()[0] == param->Strides()[1] && - // param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) { - // param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - // this->cl_helper_.AddKernel("winograd_filter_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("winograd_input_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("matmul", "matmul.cl"); - // this->cl_helper_.AddKernel("winograd_output_transform_2x2", - // wino_kernel_file, build_options); - // - // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); - // - // } else { - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - // std::cout << " input dim " << param->Input()->dims()[0] << " " - // << param->Input()->dims()[1] << " " << - // param->Input()->dims()[2] - // << " " << param->Input()->dims()[3] << " " << std::endl; - // std::cout << " output dim " << param->Output()->dims()[0] << " " - // << param->Output()->dims()[1] << " " << 
- // param->Output()->dims()[2] - // << " " << param->Output()->dims()[3] << " " << std::endl; - // std::cout << " filter dim " << param->Filter()->dims()[0] << " " - // << param->Filter()->dims()[1] << " " << - // param->Filter()->dims()[2] - // << " " << param->Filter()->dims()[3] << " " << std::endl; - - if (param->groups > 1) { - param->ExecMode() = - ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT; - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); - } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, - build_options); - } - - // } - - } else if (param->Filter()->dims()[2] == 7 && - param->Filter()->dims()[3] == 7) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file, build_options); - - } else if (param->Filter()->dims()[2] == 5 && - param->Filter()->dims()[3] == 5) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_5x5", conv_kernel_file, build_options); - - } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); - } - - return true; -} - -template <> -void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: - ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias()); - break; - case 
ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - DWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias()); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} - -template class ConvAddReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp deleted file mode 100644 index 7e8a44ced0..0000000000 --- a/mobile/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBNADDRELU_OP - -#include "operators/kernel/conv_bn_add_relu_kernel.h" -#include -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNAddReluKernel::Init( - FusionConvBNAddReluParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - const framework::CLImage *mean = param->InputMean(); - const framework::CLImage *variance = param->InputVariance(); - const framework::CLImage *scale = param->InputScale(); - const framework::CLImage *bias = param->InputBias(); - - const float epsilon = param->Epsilon(); - - const int C = mean->numel(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - float *new_scale_ptr = new float[C]; - float *new_bias_ptr = new float[C]; - - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - } - - framework::CLImage *new_scale = new framework::CLImage(); - - // for (int j = 0; j < C; ++j) { - // DLOG << " new scale - " << j << new_scale_ptr[j]; - // } - // - // for (int j = 0; j < C; ++j) { - // DLOG << " new bias - " << j << new_bias_ptr[j]; - // } - - new_scale->SetTensorData(new_scale_ptr, variance->dims()); - new_scale->InitCLImage(this->cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - // DLOG << " climage - y bias: " << *(param->Bias()); - // - // DLOG << " climage - new scale: " << *new_scale; - - framework::CLImage *new_bias = new framework::CLImage(); - - new_bias->SetTensorData(new_bias_ptr, variance->dims()); - new_bias->InitCLImage(this->cl_helper_.CLContext(), - 
cl_helper_.CLCommandQueue()); - - // DLOG << " climage - new bias: " << *new_bias; - // - // DLOG << " climage - filter: " << *(param->Filter()); - - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - delete[](new_scale_ptr); - delete[](new_bias_ptr); - - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - - param->SetOffset(offset); - - const std::string conv_kernel_file = "conv_kernel.cl"; - const std::string wino_kernel_file = "winograd_transform.cl"; - std::string build_options = "-DBATCH_NORM -DRELU"; - if (param->Output()->dims() == param->Bias()->dims()) { - build_options += " -DBIASE_ELE"; - } else { - build_options += " -DBIASE_CH"; - } - - if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("convBNAdd_1x1_spl", conv_kernel_file, - build_options); - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == 3) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("depth_convBNAdd_3x3", conv_kernel_file, - build_options); - - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3) { - // if (param->Strides()[0] == param->Strides()[1] && - // param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) { - // param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - // this->cl_helper_.AddKernel("winograd_filter_transform_2x2", - // wino_kernel_file, build_options); - // 
this->cl_helper_.AddKernel("winograd_input_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("matmul", "matmul.cl"); - // this->cl_helper_.AddKernel("winograd_output_transform_2x2_bn_add", - // wino_kernel_file, build_options); - // - // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); - // - // } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("convBNAdd_3x3", conv_kernel_file, - build_options); - // } - } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); - } - - return true; -} - -template <> -void ConvBNAddReluKernel::Compute( - const FusionConvBNAddReluParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias(), - param.NewScale(), param.NewBias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(), - param.NewScale(), param.NewBias()); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} -template class ConvBNAddReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp deleted file mode 100644 index bd8b71b85d..0000000000 --- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBNRELU_OP - -#include "operators/kernel/conv_bn_relu_kernel.h" -#include -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNReluKernel::Init( - FusionConvBNReluParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - const framework::CLImage *mean = param->InputMean(); - const framework::CLImage *variance = param->InputVariance(); - const framework::CLImage *scale = param->InputScale(); - const framework::CLImage *bias = param->InputBias(); - const float epsilon = param->Epsilon(); - - const int C = mean->numel(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - float *new_scale_ptr = new float[C]; - float *new_bias_ptr = new float[C]; - - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - } - - framework::CLImage *new_scale = new framework::CLImage(); - - // for (int j = 0; j < C; ++j) { - // DLOG << " new scale - " << j << new_scale_ptr[j]; - // } - // - // for (int j = 0; j < C; ++j) { - // DLOG << " new bias - " << j << new_bias_ptr[j]; - // } - - new_scale->SetTensorData(new_scale_ptr, variance->dims()); - 
new_scale->InitCLImage(this->cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - // DLOG << " climage - y bias: " << *(param->Bias()); - // - // DLOG << " climage - new scale: " << *new_scale; - - framework::CLImage *new_bias = new framework::CLImage(); - - new_bias->SetTensorData(new_bias_ptr, variance->dims()); - new_bias->InitCLImage(this->cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - // DLOG << " climage - new bias: " << *new_bias; - // - // DLOG << " climage - filter: " << *(param->Filter()); - - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - delete[](new_scale_ptr); - delete[](new_bias_ptr); - - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - - param->SetOffset(offset); - - const std::string conv_kernel_file = "conv_kernel.cl"; - const std::string wino_kernel_file = "winograd_transform.cl"; - const std::string build_options = "-DBATCH_NORM -DRELU"; - - if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - if (param->Input()->dims()[1] % 4 == 0) { - this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, - build_options); - } else { - this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, - build_options); - } - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file, - 
build_options); - } else { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file, - build_options); - } - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] != 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; - this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3) { - // if (param->Strides()[0] == param->Strides()[1] && - // param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) { - // param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - // this->cl_helper_.AddKernel("winograd_filter_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("winograd_input_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("matmul", "matmul.cl"); - // this->cl_helper_.AddKernel("winograd_output_transform_2x2", - // wino_kernel_file, build_options); - // - // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); - // - // } else { - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - if (param->groups > 1) { - param->ExecMode() = - ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT; - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); - } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, - build_options); - } - // } - } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); - } - return true; -} - -template <> -void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<4, 3>(&this->cl_helper_, param, 
true, nullptr, - param.NewScale(), param.NewBias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: - ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(), - param.NewBias()); - break; - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - DWConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(), - param.NewBias()); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(), - param.NewBias()); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} -template class ConvBNReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_kernel.cpp b/mobile/src/operators/kernel/cl/conv_kernel.cpp deleted file mode 100644 index 054eab85ab..0000000000 --- a/mobile/src/operators/kernel/cl/conv_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_OP - -#include "operators/kernel/conv_kernel.h" -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvKernel::Init(ConvParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - param->SetOffset(offset); - - DLOG << " init helper: " << &cl_helper_; - DLOG << " conv kernel add kernel ~ "; - DLOG << " width of one block: " << param->Filter()->dims()[3]; - DLOG << " height of one block: " << param->Filter()->dims()[2]; - DLOG << " filter dims: " << param->Filter()->dims(); - - const std::string conv_kernel_file = "conv_kernel.cl"; - const std::string wino_kernel_file = "winograd_transform.cl"; - - if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - if (param->Input()->dims()[1] % 4 == 0) { - this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file); - } else { - this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file); - } - DLOG << "conv 1x1"; - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file); - } else { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file); - } - DLOG << "depth_conv 3x3"; - - } else if (param->Filter()->dims()[1] == 1 && - 
param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] != 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; - this->cl_helper_.AddKernel("depth_conv", conv_kernel_file); - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3) { - // if (param->Strides()[0] == param->Strides()[1] && - // param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) { - // param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - // this->cl_helper_.AddKernel("winograd_filter_transform_2x2", - // wino_kernel_file); - // this->cl_helper_.AddKernel("winograd_input_transform_2x2", - // wino_kernel_file); - // this->cl_helper_.AddKernel("matmul", "matmul.cl"); - // this->cl_helper_.AddKernel("winograd_output_transform_2x2", - // wino_kernel_file); - // - // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); - // - // } else { - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - // std::cout << " input dim " << param->Input()->dims()[0] << " " - // << param->Input()->dims()[1] << " " << - // param->Input()->dims()[2] - // << " " << param->Input()->dims()[3] << " " << std::endl; - // std::cout << " output dim " << param->Output()->dims()[0] << " " - // << param->Output()->dims()[1] << " " << - // param->Output()->dims()[2] - // << " " << param->Output()->dims()[3] << " " << std::endl; - // std::cout << " filter dim " << param->Filter()->dims()[0] << " " - // << param->Filter()->dims()[1] << " " << - // param->Filter()->dims()[2] - // << " " << param->Filter()->dims()[3] << " " << std::endl; - if (param->groups > 1) { - param->ExecMode() = - ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT; - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file); - } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - this->cl_helper_.AddKernel("conv_3x3spl", 
conv_kernel_file); - } - - // } - DLOG << "conv 3x3"; - } else if (param->Filter()->dims()[2] == 7 && - param->Filter()->dims()[3] == 7) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file); - // } - DLOG << "conv 7x7"; - } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); - } - - return true; -} - -template <> -void ConvKernel::Compute(const ConvParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<4, 3>(&this->cl_helper_, param); - break; - case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: - case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: - ConvAddBnRelu(&this->cl_helper_, param); - break; - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - DWConvAddBnRelu(&this->cl_helper_, param); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} - -template class ConvKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp deleted file mode 100644 index 35511331a5..0000000000 --- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVRELU_OP - -#include "operators/kernel/conv_relu_kernel.h" -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvReluKernel::Init(FusionConvReluParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - param->SetOffset(offset); - - DLOG << " init helper: " << &cl_helper_; - DLOG << " conv kernel add kernel ~ "; - DLOG << " width of one block: " << param->Filter()->dims()[3]; - DLOG << " height of one block: " << param->Filter()->dims()[2]; - DLOG << " filter dims: " << param->Filter()->dims(); - - const std::string conv_kernel_file = "conv_kernel.cl"; - const std::string wino_kernel_file = "winograd_transform.cl"; - const std::string build_options = "-DRELU"; - - if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; - param->Filter()->InitNImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - if (param->Input()->dims()[1] % 4 == 0) { - this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, - build_options); - } else { - this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, - build_options); - } - DLOG << "conv 1x1"; - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - 
param->Filter()->dims()[2] == 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file, - build_options); - } else { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; - this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file, - build_options); - } - - DLOG << "depth_conv 3x3"; - - } else if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] != 3) { - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; - this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3) { - // if (param->Strides()[0] == param->Strides()[1] && - // param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) { - // param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; - // this->cl_helper_.AddKernel("winograd_filter_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("winograd_input_transform_2x2", - // wino_kernel_file, build_options); - // this->cl_helper_.AddKernel("matmul", "matmul.cl", build_options); - // this->cl_helper_.AddKernel("winograd_output_transform_2x2", - // wino_kernel_file, build_options); - // - // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); - // - // } else { - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - if (param->groups > 1) { - param->ExecMode() = - ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT; - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); - } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - 
this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, - build_options); - } - // } - DLOG << "conv 3x3"; - - } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); - } - - return true; -} - -template <> -void ConvReluKernel::Compute( - const FusionConvReluParam ¶m) { - switch (param.ExecMode()) { - case ConvParam::EXEC_WINOGRAD3X3_FLOAT: - WinogradConv3x3<4, 3>(&this->cl_helper_, param, true); - break; - case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT: - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: - ConvAddBnRelu(&this->cl_helper_, param, true); - break; - case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - DWConvAddBnRelu(&this->cl_helper_, param, true); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param, true); - break; - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: - SWConvAddBnRelu(&this->cl_helper_, param, true); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", - param.ExecMode()); - } -} - -template class ConvReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp deleted file mode 100644 index 4261681f3e..0000000000 --- a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef CONV_TRANSPOSE_OP - -#include "operators/kernel/conv_transpose_kernel.h" -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvTransposeKernel::Init( - ConvTransposeParam* param) { - PADDLE_MOBILE_ENFORCE(param->Strides()[0] == param->Strides()[1] && - param->Paddings()[0] == param->Paddings()[1] && - param->Dilations()[0] == param->Dilations()[1] && - param->Dilations()[0] == 1, - "need equal"); - - if (param->Filter()->dims()[1] == 1 && - param->Input()->dims()[1] == param->Output()->dims()[1]) { - param->ExecMode() = ConvTransposeParam::EXEC_DEPTHWISETRANS_FLOAT; - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("depthwise_transpose", - "conv_transpose_kernel.cl"); - } else if (param->Filter()->dims()[2] == 3 && - param->Filter()->dims()[3] == 3 && param->Strides()[0] == 2) { - param->ExecMode() = ConvTransposeParam::EXEC_CONVTRANS3x3s2_FLOAT; - param->Filter()->InitConv2dTransposeFilterCLImage( - cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_transpose3x3s2", - "conv_transpose_kernel.cl"); - } else { - param->ExecMode() = ConvTransposeParam::EXEC_CONVTRANS_FLOAT; - param->Filter()->InitConv2dTransposeFilterCLImage( - cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_transpose", "conv_transpose_kernel.cl"); - } - return true; -} - -template <> -void ConvTransposeKernel::Compute( - const ConvTransposeParam& param) { - switch (param.ExecMode()) { - case ConvTransposeParam::EXEC_DEPTHWISETRANS_FLOAT: - DWConvTransposeAddBnRelu(&this->cl_helper_, param); - break; - case ConvTransposeParam::EXEC_CONVTRANS3x3s2_FLOAT: - ConvTranspose3x3s2AddBnRelu(&this->cl_helper_, param); - break; - case ConvTransposeParam::EXEC_CONVTRANS_FLOAT: - 
ConvTransposeAddBnRelu(&this->cl_helper_, param); - break; - default: - PADDLE_MOBILE_THROW_EXCEPTION( - "Invalid convolution transpose execute mode %d", param.ExecMode()); - } -} - -template class ConvTransposeKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/density_prior_box_kernel.cpp b/mobile/src/operators/kernel/cl/density_prior_box_kernel.cpp deleted file mode 100644 index 1a5cf0f061..0000000000 --- a/mobile/src/operators/kernel/cl/density_prior_box_kernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DENSITY_PRIORBOX_OP - -#include -#include "framework/cl/cl_tensor.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool DensityPriorBoxKernel::Init( - paddle_mobile::operators::DensityPriorBoxParam - *param) { - this->cl_helper_.AddKernel("density_prior_box", - "density_prior_box_kernel.cl"); - vector fixed_sizes = param->FixedSizes(); - vector fixed_ratios = param->FixedRatios(); - vector densities = param->Densities(); - vector variances = param->Variances(); - int fix_ratio_size = fixed_ratios.size(); - int total_size = densities.size() + fixed_sizes.size() + fix_ratio_size; - float *densities_data = new float[total_size]; - for (int i = 0; i < densities.size(); ++i) { - float density = densities[i]; - densities_data[i] = density; - } - - for (int k = 0; k < fixed_sizes.size(); ++k) { - densities_data[k + densities.size()] = fixed_sizes[k]; - } - - for (int j = 0; j < fixed_ratios.size(); ++j) { - float sqrt_ratios = sqrt(fixed_ratios[j]); - densities_data[j + densities.size() + fixed_sizes.size()] = sqrt_ratios; - } - - framework::CLImage *new_density = new framework::CLImage(); - new_density->SetTensorData(densities_data, {1, 1, 1, total_size}); - new_density->InitCLImage(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - param->setNewDensity(new_density); - - delete[](densities_data); - - return true; -} - -template <> -void DensityPriorBoxKernel::Compute( - const paddle_mobile::operators::DensityPriorBoxParam - ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - const auto *input = param.Input(); - const auto input_dims = input->dims(); - const auto input_image_dims = param.InputImage()->dims(); - - auto output_boxes = param.OutputBoxes()->GetCLImage(); - auto output_var = param.OutputVariances()->GetCLImage(); - auto new_density = param.getNewDensity()->GetCLImage(); - - float step_w = param.StepW(); - float step_h = param.StepH(); - float offset = param.Offset(); - vector fixed_sizes = 
param.FixedSizes(); - vector fixed_ratios = param.FixedRatios(); - vector densities = param.Densities(); - vector variances = param.Variances(); - - // feature map - auto input_heigh = input_dims[2]; - auto input_width = input_dims[3]; - - auto image_heigh = input_image_dims[2]; - auto image_width = input_image_dims[3]; - - const int C = param.OutputBoxes()->dims()[1]; - - if (step_w == 0 || step_h == 0) { - step_h = static_cast(image_heigh) / input_heigh; - step_w = static_cast(image_width) / input_width; - } - int num_density = 0; - for (int l = 0; l < densities.size(); ++l) { - num_density += densities[l] * densities[l] * fixed_ratios.size(); - } - - param.OutputBoxes()->Resize({input_heigh, input_width, num_density, 4}); - int step_average = static_cast((step_w + step_h) * 0.5); - int densities_and_fixedsize_size = densities.size(); - int fix_ratio_size = fixed_ratios.size(); - - auto default_work = this->cl_helper_.DefaultWorkSize(*param.OutputBoxes()); - - float variances0 = variances[0]; - float variances1 = variances[1]; - float variances2 = variances[2]; - float variances3 = variances[3]; - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &output_boxes); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_var); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_density); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(float), &step_h); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(float), &step_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &variances0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &variances1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &variances2); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(int), &variances3); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, 
sizeof(float), &offset); - CL_CHECK_ERRORS(status); - status = - clSetKernelArg(kernel, 10, sizeof(int), &densities_and_fixedsize_size); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &image_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &image_heigh); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 13, sizeof(int), &C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 14, sizeof(int), &num_density); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 15, sizeof(int), &step_average); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 16, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 17, sizeof(int), &default_work[0]); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 18, sizeof(int), &fix_ratio_size); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, - default_work.size(), NULL, - default_work.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/mobile/src/operators/kernel/cl/depthwise_conv_kernel.cpp deleted file mode 100644 index 372c25b596..0000000000 --- a/mobile/src/operators/kernel/cl/depthwise_conv_kernel.cpp +++ /dev/null @@ -1,96 +0,0 @@ -///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. */ -// -//#ifdef DEQUANT_OP -// -//#include "operators/kernel/dequantize_kernel.h" -// -// namespace paddle_mobile { -// namespace operators { -// -// template <> -// bool DequantizeKernel::Init(DequantizeParam *param) { -// DLOG << " depthwise conv kernel init begin "; -// PADDLE_MOBILE_ENFORCE( -// param->Filter()->dims()[2] == param->Filter()->dims()[3] && -// param->Paddings()[0] == param->Paddings()[1], -// "need equal"); -// param->Filter()->InitCLImage(cl_helper_.CLContext(), -// this->cl_helper_.CLCommandQueue()); -// int offset = static_cast(param->Filter()->dims()[2]) / 2 - -// static_cast(param->Paddings()[1]); -// param->SetOffset(offset); -// this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); -// DLOG << " depthwise conv kernel init end "; -// return true; -//} -// -// template <> -// void DequantizeKernel::Compute( -// const DequantizeParam ¶m) { -// auto kernel = this->cl_helper_.KernelAt(0); -// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); -// int c_block = default_work_size[0]; -// int w = default_work_size[1]; -// int nh = default_work_size[2]; -// auto input = param.Input()->GetCLImage(); -// auto filter = param.Filter()->GetCLImage(); -// auto output = param.Output()->GetCLImage(); -// int stride = param.Strides()[0]; -// int offset = param.Offset(); -// int input_c = reinterpret_cast( -// param.Input()->Converter()) -// ->GetCBlock(); -// int dilation = param.Dilations()[0]; -// -// int input_width = param.Input()->dims()[3]; -// int input_height = param.Input()->dims()[2]; -// int output_width = param.Output()->dims()[3]; -// int output_height = param.Output()->dims()[2]; -// -// cl_int status; -// -// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); -// status = clSetKernelArg(kernel, 1, sizeof(int), &w); -// status = clSetKernelArg(kernel, 2, sizeof(int), &nh); -// status 
= clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); -// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); -// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); -// status = clSetKernelArg(kernel, 6, sizeof(int), &stride); -// status = clSetKernelArg(kernel, 7, sizeof(int), &offset); -// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); -// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); -// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); -// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); -// status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); -// status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); -// -// CL_CHECK_ERRORS(status); -// -// // cl_event out_event = param.Output()->GetClEvent(); -// // cl_event wait_event = param.Input()->GetClEvent(); -// -// status = clEnqueueNDRangeKernel( -// this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), -// NULL, default_work_size.data(), NULL, 0, NULL, NULL); -// -// CL_CHECK_ERRORS(status); -//} -// -// template class DepthwiseConvKernel; -// -//} // namespace operators -//} // namespace paddle_mobile -// -//#endif diff --git a/mobile/src/operators/kernel/cl/dropout_kernel.cpp b/mobile/src/operators/kernel/cl/dropout_kernel.cpp deleted file mode 100644 index db9437841b..0000000000 --- a/mobile/src/operators/kernel/cl/dropout_kernel.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DROPOUT_OP - -#include "operators/kernel/dropout_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DropoutKernel::Init(DropoutParam *param) { - this->cl_helper_.AddKernel("dropout", "dropout_kernel.cl"); - return true; -} - -template <> -void DropoutKernel::Compute(const DropoutParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - auto *input_image = param.InputX()->GetCLImage(); - auto *output_image = param.Out()->GetCLImage(); - const float dropoutProb = param.DropoutProb(); - const auto &inputDim = param.InputX()->dims(); - int input_dims[4] = {1, 1, 1, 1}; - // 1 1000 1 1 - for (int i = 0; i < inputDim.size(); i++) { - input_dims[4 - inputDim.size() + i] = inputDim[i]; - } - int out_W = input_dims[1]; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(float), &dropoutProb); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp deleted file mode 100644 index 03362a8d9f..0000000000 --- a/mobile/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DWCONVBNRELU_OP - -#include "operators/kernel/dwconv_bn_relu_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool DWConvBNReluKernel::Init( - FusionDWConvBNReluParam *param) { - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - const framework::CLImage *mean = param->InputMean(); - const framework::CLImage *variance = param->InputVariance(); - const framework::CLImage *scale = param->InputScale(); - const framework::CLImage *bias = param->InputBias(); - const float epsilon = param->Epsilon(); - - const int C = mean->numel(); - - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - float inv_std_ptr[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - float *new_scale_ptr = new float[C]; - float *new_bias_ptr = new float[C]; - - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - } - - framework::CLImage *new_scale = new framework::CLImage(); - - new_scale->SetTensorData(new_scale_ptr, variance->dims()); - new_scale->InitCLImage(this->cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - framework::CLImage *new_bias = 
new framework::CLImage(); - - new_bias->SetTensorData(new_bias_ptr, variance->dims()); - new_bias->InitCLImage(this->cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); - - delete[](new_scale_ptr); - delete[](new_bias_ptr); - - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - - param->SetOffset(offset); - - param->Filter()->InitDWImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("depth_conv_3x3", "conv_bn_relu_kernel.cl"); - DLOG << " conv bn relu depth_conv_3x3"; - - return true; -} - -template <> -void DWConvBNReluKernel::Compute( - const FusionDWConvBNReluParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - auto new_scale = param.NewScale()->GetCLImage(); - auto new_bias = param.NewBias()->GetCLImage(); - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - 
CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 15, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} -template class DWConvBNReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_add_kernel.cpp deleted file mode 100644 index 06d718601c..0000000000 --- a/mobile/src/operators/kernel/cl/elementwise_add_kernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ELEMENTWISEADD_OP - -#include "operators/kernel/elementwise_add_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseAddKernel::Init( - ElementwiseAddParam *param) { - DLOG << "-----init add-----"; - CLImage *bias = - reinterpret_cast(const_cast(param->InputY())); - if (bias->dims().size() == 4) { - if (!bias->isInit()) { - bias->InitNormalCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - } - DLOG << " bias: " << *bias; - this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); - } else if (param->InputY()->dims().size() == 1) { - if (param->Axis() == param->InputX()->dims().size() - 1) { - if (!bias->isInit()) { - bias->InitNormalCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - } - DLOG << " bias: " << *bias; - this->cl_helper_.AddKernel("width_add", "channel_add_kernel.cl"); - } else if (param->Axis() == param->InputX()->dims().size() - 3) { - if (!bias->isInit()) { - bias->InitCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - } - DLOG << " bias: " << *bias; - this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); - } else { - DLOG << "error:bias dims is error"; - } - } else { - DLOG << "error:bias dims is error"; - } - return true; -} - -template <> -void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) { - auto input = param.InputX(); - auto bias = param.InputY(); - auto 
output = param.Out(); - cl_int status; - auto kernel = this->cl_helper_.KernelAt(0); - if (bias->dims().size() == 4) { - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 1) { - if (param.Axis() == param.InputX()->dims().size() - 1 || - param.Axis() == param.InputX()->dims().size() - 3) { - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - DLOG << "dede:" << width << "," << height; - size_t global_work_size[2] = {width, height}; - cl_event out_event = param.Out()->GetClEvent(); - cl_event wait_event = param.InputX()->GetClEvent(); - status = - 
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else { - DLOG << "error:bias dims is error"; - } - } else { - DLOG << "error:bias dims is error"; - } -} - -template class ElementwiseAddKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp deleted file mode 100644 index 51a213026b..0000000000 --- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ELEMENTWISEMUL_OP - -#include "operators/kernel/elementwise_mul_kernel.h" -#include -#include -#include "framework/cl/cl_image.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseMulKernel::Init( - ElementwiseMulParam *param) { - framework::CLImage *bias = reinterpret_cast( - const_cast(param->InputY())); - if (bias->dims() == param->InputX()->dims()) { - DLOG << "init element wise mul"; - this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); - } else { - const int bias_dim_size = bias->dims().size(); - if (bias_dim_size == 1) { - DLOG << "init channel_mul"; - this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); - } else if (bias_dim_size == 2) { - // etc. 
input 1 72 28 28 - // filter 1 72 - DLOG << "init channel_mul_d2"; - this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); - } else if (bias_dim_size == 3) { - DLOG << "init channel_mul_d3"; - this->cl_helper_.AddKernel("channel_mul_d3", "elementwise_mul_kernel.cl"); - } else if (bias_dim_size == 4) { - DLOG << "init channel_mul_d4"; - this->cl_helper_.AddKernel("channel_mul_d4", "elementwise_mul_kernel.cl"); - } else { - PADDLE_MOBILE_ENFORCE(false, - "element mul not supported this situation yet"); - } - } - return true; -} -template <> -void ElementwiseMulKernel::Compute( - const ElementwiseMulParam ¶m) { - auto input = param.InputX(); - auto bias = param.InputY(); - auto output = param.Out(); - cl_int status; - auto kernel = this->cl_helper_.KernelAt(0); - if (bias->dims() == input->dims()) { - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else { - const int bias_dim_size = bias->dims().size(); - if (bias_dim_size == 1) { - DLOG << "channel mul"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - 
CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else if (bias_dim_size == 2) { - DLOG << "channel mul d2"; - - // etc. input 1 72 28 28 - // filter 1 72 --> 1 1 1 72 - DLOG << "input->ImageDims(): " << input->ImageDims(); - DLOG << "bias->ImageDims(): " << bias->ImageDims(); - DLOG << "out->ImageDims(): " << output->ImageDims(); - - DLOG << "channel mul d2"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - - // bias->PrintTensor(*bias); - } else if (bias_dim_size == 3) { - DLOG << "channel_mul_d3"; - // etc. 
input 1 72 28 28 - // filter 1 72 --> 1 1 1 72 - DLOG << "input->ImageDims(): " << input->ImageDims(); - DLOG << "bias->ImageDims(): " << bias->ImageDims(); - DLOG << "out->ImageDims(): " << output->ImageDims(); - - DLOG << "channel mul d3"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else if (bias_dim_size == 4) { - DLOG << "channel_mul_d4"; - // etc. 
input 1 72 28 28 - // filter 1 72 --> 1 1 1 72 - DLOG << "input->ImageDims(): " << input->ImageDims(); - DLOG << "bias->ImageDims(): " << bias->ImageDims(); - DLOG << "out->ImageDims(): " << output->ImageDims(); - - DLOG << "channel mul d4"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else { - PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") - } - } -} - -template class ElementwiseMulKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp deleted file mode 100644 index b107b3de3c..0000000000 --- a/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ELEMENTWISESUB_OP - -#include "operators/kernel/elementwise_sub_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseSubKernel::Init( - ElementwiseSubParam *param) { - framework::CLImage *bias = reinterpret_cast( - const_cast(param->InputY())); - if (bias->dims().size() == 4) { - if (!bias->isInit()) { - bias->InitNormalCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - } - DLOG << " bias: " << *bias; - this->cl_helper_.AddKernel("elementwise_sub", "elementwise_sub_kernel.cl"); - } else { - DLOG << "error:bias dims not support"; - } - return true; -} - -template <> -void ElementwiseSubKernel::Compute( - const ElementwiseSubParam ¶m) { - auto input = param.InputX(); - auto bias = param.InputY(); - auto output = param.Out(); - cl_int status; - auto kernel = this->cl_helper_.KernelAt(0); - if (bias->dims().size() == 4) { - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bias_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - 
CL_CHECK_ERRORS(status); - } else { - DLOG << "error:bias dims not support"; - } -} - -template class ElementwiseSubKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/exp_kernel.cpp b/mobile/src/operators/kernel/cl/exp_kernel.cpp deleted file mode 100644 index 76cbae1efd..0000000000 --- a/mobile/src/operators/kernel/cl/exp_kernel.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef EXP_OP - -#include -#include -namespace paddle_mobile { -namespace operators { - -template <> -bool EXPKernel::Init( - paddle_mobile::operators::EXPParam* param) { - this->cl_helper_.AddKernel("exp_impl", "exp_kernel.cl"); - return true; -} - -template <> -void EXPKernel::Compute( - const paddle_mobile::operators::EXPParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - const auto* input = param.InputX(); - auto* output = param.Out(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -template class EXPKernel; -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/expand_kernel.cpp b/mobile/src/operators/kernel/cl/expand_kernel.cpp deleted file mode 100644 index f424a31b4f..0000000000 --- a/mobile/src/operators/kernel/cl/expand_kernel.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef EXPAND_OP - -#include "operators/kernel/expand_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ExpandKernel::Init(ExpandParam* param) { - const framework::DDim& input_dims = param->InputX()->dims(); - PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, - "expend now support 4 size dims"); - if (input_dims[1] == 1) { - this->cl_helper_.AddKernel("expend_c1", "expend.cl"); - } else if (input_dims[1] == 2) { - this->cl_helper_.AddKernel("expend_c2", "expend.cl"); - } else if (input_dims[1] == 4) { - this->cl_helper_.AddKernel("expend_c4", "expend.cl"); - } else { - PADDLE_MOBILE_ENFORCE(false, "expend did not supported this type"); - } - return true; -} - -template <> -void ExpandKernel::Compute(const ExpandParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - DLOG << "param.Out()->dims(): " << param.Out()->dims(); - const framework::DDim& image_dims = param.Out()->ImageDims(); - DLOG << "param.Out()->image_dims(): " << image_dims; - - auto out_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - DLOG << "out_work_size: " << out_work_size; - - int out_c_block = out_work_size[0]; - int out_w = out_work_size[1]; - int out_nh = out_work_size[2]; - - auto in_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); - int in_c_block = in_work_size[0]; - int in_w = in_work_size[1]; - int in_nh = in_work_size[2]; - - int input_width = param.InputX()->dims()[3]; - int input_height = param.InputX()->dims()[2]; - int output_width = param.Out()->dims()[3]; - int output_height = param.Out()->dims()[2]; - - const auto* input = param.InputX(); - auto* output = param.Out(); - vector expandTimes = {1, 1, 1, 1}; - DLOG << "param.expand_times: " << param.expand_times; - - for (int i = 0; i < param.expand_times.size(); ++i) { - expandTimes[i] = param.expand_times[i]; - } - - DLOG << "expandTimes: " << expandTimes; - - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - - input->dims(); - - 
int idx = 0; - - cl_int status; - status = clSetKernelArg(kernel, idx++, sizeof(int), &out_c_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &out_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &out_nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, idx++, sizeof(int), &in_c_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &in_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &in_nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, idx++, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[0]); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[1]); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[2]); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[3]); - CL_CHECK_ERRORS(status); - - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - out_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - - DLOG << *output; -} - -template class ExpandKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/feed_kernel.cpp b/mobile/src/operators/kernel/cl/feed_kernel.cpp deleted file mode 100644 index 
f960595934..0000000000 --- a/mobile/src/operators/kernel/cl/feed_kernel.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/kernel/feed_kernel.h" -#include "framework/cl/cl_tensor.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FeedKernel::Init(FeedParam *param) { - DLOG << "Init feed"; - if (this->pre_post_type_ == UINT8_255) { - this->cl_helper_.AddKernel("feed_with_pre", "feed_kernel.cl"); - } else { - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - } - return true; -} - -template <> -void FeedKernel::Compute(const FeedParam ¶m) { - const int col = param.Col(); - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - cl_int status; - auto output = param.Out(); - const Tensor *input = ¶m.InputX()->at(col); - // DLOG << *input; - - int numel = input->numel(); - cl_mem output_image = output->GetCLImage(); - const int out_C = output->dims()[1]; - const int out_H = output->dims()[2]; - const int out_W = output->dims()[3]; - const int Stride2 = out_C * out_H * out_W; - const int Stride1 = out_H * out_W; - const int Stride0 = out_W; - framework::CLTensor input_cl_tensor(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - input_cl_tensor.Resize(input->dims()); - cl_mem inputBuffer; - if (this->pre_post_type_ == UINT8_255) { - inputBuffer = - 
input_cl_tensor.mutable_with_data(input->data()); - } else { - inputBuffer = - input_cl_tensor.mutable_with_data(input->data()); - } - - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - - CL_CHECK_ERRORS(status); -} - -template class FeedKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/cl/fetch_kernel.cpp b/mobile/src/operators/kernel/cl/fetch_kernel.cpp deleted file mode 100644 index df2c2e1f5c..0000000000 --- a/mobile/src/operators/kernel/cl/fetch_kernel.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/kernel/fetch_kernel.h" -#include "framework/cl/cl_tensor.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FetchKernel::Init(FetchParam *param) { - if (this->pre_post_type_ == UINT8_255) { - this->cl_helper_.AddKernel("fetch_with_post", "fetch_kernel.cl"); - } else { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - } - return true; -} - -template <> -void FetchKernel::Compute(const FetchParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); - - const int col = param.Col(); - auto input = param.InputX()->GetCLImage(); - auto *out = ¶m.Out()->at(col); - out->Resize(param.InputX()->dims()); - - DLOG << "fetch kernel out dims = " << out->dims(); - DLOG << "fetch kernel out memory size = " << out->memory_size(); - - auto dim = param.InputX()->dims(); - size_t new_dims[] = {1, 1, 1, 1}; - - for (int j = 0; j < dim.size(); ++j) { - new_dims[4 - dim.size() + j] = dim[j]; - } - - size_t in_ch, in_height, in_width; - - in_ch = new_dims[1]; - in_height = new_dims[2]; - in_width = new_dims[3]; - int size_ch = in_height * in_width; - int size_block = size_ch * 4; - int size_batch = size_ch * in_ch; - - framework::CLTensor out_cl_tensor(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - out_cl_tensor.Resize(out->dims()); - cl_mem outBuffer; - if (this->pre_post_type_ == UINT8_255) { - out->mutable_data(); - outBuffer = out_cl_tensor.mutable_data(); - } else { - out->mutable_data(); - outBuffer = out_cl_tensor.mutable_data(); - } - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(int), &in_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &in_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - status 
= clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &size_ch); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &size_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &size_batch); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &in_ch); - CL_CHECK_ERRORS(status); - - // cl_event wait_event = param.InpdutX()->GetClEvent(); - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - - clFinish(this->cl_helper_.CLCommandQueue()); - - DLOG << "fetch kernel out dims = " << out->dims(); - DLOG << "fetch kernel out memory size = " << out->memory_size(); - - DLOG << "fetch kernel out_cl_tensor dims = " << out_cl_tensor.dims(); - DLOG << "fetch kernel out_cl_tensor memery size = " - << out_cl_tensor.memory_size(); - if (this->pre_post_type_ == UINT8_255) { - memcpy(out->data(), out_cl_tensor.Data(), - sizeof(uint8_t) * out->numel()); - } else { - memcpy(out->data(), out_cl_tensor.Data(), - sizeof(float) * out->numel()); - } -} - -template class FetchKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/cl/flatten2_kernel.cpp b/mobile/src/operators/kernel/cl/flatten2_kernel.cpp deleted file mode 100644 index 43eeffe072..0000000000 --- a/mobile/src/operators/kernel/cl/flatten2_kernel.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FLATTEN2_OP - -#include "operators/kernel/flatten2_kernel.h" -#include -namespace paddle_mobile { -namespace operators { - -template <> -bool Flatten2Kernel::Init( - paddle_mobile::operators::FlattenParam *param) { - this->cl_helper_.AddKernel("flatten2", "flatten2_kernel.cl"); - return true; -} - -template <> -void Flatten2Kernel::Compute( - const paddle_mobile::operators::FlattenParam - ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - const auto *input = param.InputX(); - auto *output = param.Out(); - auto input_image = input->GetCLImage(); - auto output_image = output->GetCLImage(); - - int in_width = input->dims()[3]; - int in_height = input->dims()[2]; - int in_c = input->dims()[1]; - - int out_width = output->dims()[1]; - DLOG << "flatten2 dims :" << output->dims() << " in: " << input->dims(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); - DLOG << "flatten2 work size :" << default_work_size.data()[0] << " " - << default_work_size.data()[1] << " " << default_work_size.data()[2] - << " " << default_work_size.size(); - - // const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; - DLOG << "flatten2 work data :" << output->ImageWidth() << " " - << output->ImageHeight(); - - DLOG << "flatten2 work data 4:" << out_width << " " << in_width << " " - << in_height << " " << in_c; - - int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = 
clSetKernelArg(kernel, 2, sizeof(int), &out_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &in_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &in_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &in_c); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/fusion_fc_kernel.cpp b/mobile/src/operators/kernel/cl/fusion_fc_kernel.cpp deleted file mode 100644 index de6a0455b9..0000000000 --- a/mobile/src/operators/kernel/cl/fusion_fc_kernel.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_FC_OP - -#include "operators/kernel/fusion_fc_kernel.h" -#include "operators/math/math_function.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcKernel::Init(FusionFcParam *param) { - param->InputY()->InitNormalCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - param->InputZ()->InitNormalCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - return true; -} - -template -void FusionFcCompute(const FusionFcParam ¶m, cl_context context, - cl_command_queue commandQueue, cl_kernel kernel0, - cl_kernel kernel1) { - auto *input_x_image = param.InputX(); - auto *input_y_image = param.InputY(); - auto *input_z_image = param.InputZ(); - - int axis = param.Axis(); - auto *out_image = param.Out(); - - Tensor *input_x = new Tensor(); - input_x->Resize(input_x_image->dims()); - input_x->mutable_data(); - framework::CLImageToTensor(input_x_image, input_x, context, commandQueue, - kernel0); - - Tensor *input_y = new Tensor(); - input_y->Resize(input_y_image->dims()); - input_y->mutable_data(); - framework::CLImageToTensor(input_y_image, input_y, context, commandQueue, - kernel0); - - Tensor *input_z = new Tensor(); - input_z->Resize(input_z_image->dims()); - input_z->mutable_data(); - framework::CLImageToTensor(input_z_image, input_z, context, commandQueue, - kernel0); - auto *input_z_data = input_z->data(); - - DLOG << *input_x; - DLOG << *input_y; - DLOG << *input_z; - - Tensor *out = new Tensor(); - out->Resize(out_image->dims()); - out->mutable_data(); - auto *out_data = out->mutable_data(); - - const Tensor x_matrix = - input_x->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) - : *input_x; - const Tensor y_matrix = - input_y->dims().size() > 2 - ? 
framework::ReshapeToMatrix(*input_y, param.YNumColDims()) - : *input_y; - auto out_dim = out->dims(); - if (out_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); - PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1"); - PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], - " out_dim.size must be 2."); - axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); - PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. "); - - int64_t classes = input_z->numel(); - for (int i = 0; i < out_dim[0]; i++) { - memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); - } - - math::MatMul(x_matrix, false, y_matrix, false, - static_cast(1), out, static_cast(1), - false); - - // out_image->InitEmptyImage(context, commandQueue, out->dims()); - framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1); - - delete (input_x); - delete (input_y); - delete (input_z); - delete (out); - PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); -} - -template <> -void FusionFcKernel::Compute( - const FusionFcParam ¶m) { - auto kernel0 = this->cl_helper_.KernelAt(0); - auto kernel1 = this->cl_helper_.KernelAt(1); - FusionFcCompute(param, this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), kernel0, kernel1); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/gen_code.py b/mobile/src/operators/kernel/cl/gen_code.py deleted file mode 100644 index 888c06e9a4..0000000000 --- a/mobile/src/operators/kernel/cl/gen_code.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -import os -import sys - -def gen_opencl_kernels(): - source = """ - #pragma - #ifdef PADDLE_MOBILE_CL - #include - #include - #include - namespace paddle_mobile { - // func name => source - extern const std::map> opencl_kernels = { - %s - }; - // file name => header - extern const std::map> opencl_headers = { - %s - }; - } - #endif - """ - - def string_to_hex(str): - hex_list = [] - for i in range(len(code_str)): - hex_ = hex(ord(code_str[i])) - hex_list.append(hex_) - return hex_list - - def clean_source(content): - new_content = re.sub(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/", "", content, flags=re.DOTALL) - lines = new_content.split("\n") - new_lines = [] - for i in range(len(lines)): - line = lines[i] - line = re.sub(r"//.*$", "", line) - line = line.strip() - if line == "": - continue - new_lines.append(line) - new_content = "\n".join(new_lines) - return new_content - - infile = open("cl_kernel/cl_common.h", "r") - common_content = infile.read() - infile.close() - common_content = clean_source(common_content) - - infile = open("cl_kernel/conv_kernel.inc.cl", "r") - inc_content = infile.read() - infile.close() - inc_content = clean_source(inc_content) - - def get_header_raw(content): - lines = content.split("\n") - new_lines = [] - for line in lines: - if "__kernel void" in line: - break - new_lines.append(line) - header = "\n".join(new_lines) - return header - common_header = get_header_raw(common_content) - inc_header = get_header_raw(inc_content) - - def get_header(content): - lines = content.split("\n") - new_lines = [] - for line 
in lines: - if "__kernel void" in line: - break - new_lines.append(line) - for i in range(len(new_lines)): - if "#include \"conv_kernel.inc.cl\"" in new_lines[i]: - new_lines[i] = inc_header - header = "\n".join(new_lines) - new_lines = header.split("\n") - for i in range(len(new_lines)): - if "#include \"cl_common.h\"" in new_lines[i]: - new_lines[i] = common_header - header = "\n".join(new_lines) - return header - - def get_funcs(content): - funcs = {} - lines = content.split("\n") - first_kernel_idx = None - for i in range(len(lines)): - if "__kernel void" in lines[i]: - first_kernel_idx = i - break - if first_kernel_idx is None: - return funcs - lines = lines[first_kernel_idx:] - func = [] - name = "" - for line in lines: - if "__kernel void" in line: - if name != "": - funcs[name] = "\n".join(func) - name = "" - func = [] - pattern = re.compile("__kernel void ([^(]+)\(") - match = pattern.search(line) - name = match.group(1) - func.append(line) - if name != "": - funcs[name] = "\n".join(func) - name = "" - func = [] - return funcs - - filenames = os.listdir("cl_kernel") - file_count = len(filenames) - - headers = {} - funcs = {} - for i in range(file_count): - filename = filenames[i] - infile = open("cl_kernel/" + filename, "r") - content = infile.read() - infile.close() - content = clean_source(content) - header = get_header(content) - headers[filename] = header - funcs_temp = get_funcs(content) - for key in funcs_temp: - funcs[key] = funcs_temp[key] - - core1 = "" - core2 = "" - - for i in range(len(funcs)): - func_name = list(funcs.keys())[i] - content = funcs[func_name] - if content == "": - content = " " - hexes = [] - for char in content: - hexes.append(hex(ord(char))) - core = " {\"%s\", {" % func_name - for item in hexes: - core += str(item) + ", " - core = core[: -2] - core += "}}" - if i != len(funcs) - 1: - core += ",\n" - core1 += core - - for i in range(len(headers)): - file_name = list(headers.keys())[i] - content = headers[file_name] - if 
content == "": - content = " " - hexes = [] - for char in content: - hexes.append(hex(ord(char))) - core = " {\"%s\", {" % file_name - for item in hexes: - core += str(item) + ", " - core = core[: -2] - core += "}}" - if i != len(headers) - 1: - core += ",\n" - core2 += core - source = source % (core1, core2) - print(source) - -def gen_empty_opencl_kernels(): - source = """ - #pragma - #ifdef PADDLE_MOBILE_CL - #include - #include - #include - namespace paddle_mobile { - // func name => source - extern const std::map> opencl_kernels = { - }; - // file name => header - extern const std::map> opencl_headers = { - }; - } - #endif - """ - print(source) - -if __name__ == "__main__": - if sys.argv[1] == "0": - gen_empty_opencl_kernels() - elif sys.argv[1] == "1": - gen_opencl_kernels() diff --git a/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp b/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp deleted file mode 100644 index 3a20ebd94e..0000000000 --- a/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef GRID_SAMPLER_OP - -#include "operators/kernel/grid_sampler_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool GridSamplerKernel::Init(GridSamplerParam* param) { - this->cl_helper_.AddKernel("grid_sampler", "grid_sampler_kernel.cl"); - return true; -} - -template <> -void GridSamplerKernel::Compute( - const GridSamplerParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Output())); - cl_int status; - auto output = param.Output(); - auto input = param.InputX(); - auto grid = param.Grid(); - auto output_image = output->GetCLImage(); - auto input_image = input->GetCLImage(); - auto grid_image = grid->GetCLImage(); - const int out_H = output->dims()[2]; - const int out_W = output->dims()[3]; - - status = clSetKernelArg(kernel, 0, sizeof(cl_int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &grid_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - - const size_t work_size[3] = {default_work_size[0], default_work_size[1], - default_work_size[2] / 4}; - - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, - NULL, work_size, NULL, 0, NULL, NULL); - - CL_CHECK_ERRORS(status); -} - -template class GridSamplerKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp b/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp deleted file mode 100644 index d0f377faee..0000000000 --- a/mobile/src/operators/kernel/cl/instancenorm_kernel.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef INSTANCENORM_OP - -#include "operators/kernel/instancenorm_kernel.h" -#include -#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool InstanceNormKernel::Init(InstanceNormParam *param) { - auto &dims = param->OutputY()->dims(); - const int h = dims[2]; - std::string build_options = ""; - if (h == 128) { - build_options = "-DLOCAL_MEM_128"; - } else if (h == 64) { - build_options = "-DLOCAL_MEM_64"; - } - this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl", - build_options); - return true; -} - -template <> -void InstanceNormKernel::Compute( - const InstanceNormParam ¶m) { - InstanceNorm(&this->cl_helper_, param.InputX(), param.OutputY(), - param.Epsilon()); -} - -template class InstanceNormKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp b/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp deleted file mode 100644 index bd1d1f8742..0000000000 --- a/mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_INSTANCENORM_RELU_OP - -#include "operators/kernel/instancenorm_relu_kernel.h" -#include -#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool InstanceNormReluKernel::Init( - FusionInstanceNormReluParam *param) { - auto &dims = param->Out()->dims(); - const int h = dims[2]; - std::string build_options = " -DRELU"; - if (h == 128) { - build_options += " -DLOCAL_MEM_128"; - } else if (h == 64) { - build_options += " -DLOCAL_MEM_64"; - } - this->cl_helper_.AddKernel("instancenorm", "instancenorm_kernel.cl", - build_options); - return true; -} - -template <> -void InstanceNormReluKernel::Compute( - const FusionInstanceNormReluParam ¶m) { - InstanceNorm(&this->cl_helper_, param.InputX(), param.Out(), param.Epsilon()); -} - -template class InstanceNormReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/leakyrelu_kernel.cpp b/mobile/src/operators/kernel/cl/leakyrelu_kernel.cpp deleted file mode 100644 index 9487d57b2c..0000000000 --- a/mobile/src/operators/kernel/cl/leakyrelu_kernel.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef LEAKY_RELU_OP - -#include - -namespace paddle_mobile { -namespace operators { -template <> -bool LeakyReluKernel::Init( - paddle_mobile::operators::LeakyReluParam *param) { - this->cl_helper_.AddKernel("leakyrelu", "leakyrelu_kernel.cl"); - return true; -} - -template <> -void LeakyReluKernel::Compute( - const paddle_mobile::operators::LeakyReluParam - ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - auto input = param.InputX(); - cl_mem input_image = input->GetCLImage(); - auto output = param.Out(); - cl_mem out_image = output->GetCLImage(); - float alpha = param.Alpha(); - int out_dims_w = output->dims()[3]; - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(float), &alpha); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &out_dims_w); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} -template class LeakyReluKernel; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/lrn_kernel.cpp b/mobile/src/operators/kernel/cl/lrn_kernel.cpp deleted file mode 100644 index e7e949e5ab..0000000000 --- 
a/mobile/src/operators/kernel/cl/lrn_kernel.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef LRN_OP - -#include "operators/kernel/lrn_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool LrnKernel::Init(LrnParam *param) { - this->cl_helper_.AddKernel("lrn", "lrn_kernel.cl"); - return true; -} - -template <> -void LrnKernel::Compute(const LrnParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - - auto input_image = param.InputX()->GetCLImage(); - auto x_dims = param.InputX()->dims(); - auto output_image = param.Out()->GetCLImage(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; - - const int n = param.N(); - const float alpha = param.Alpha(); - const float beta = param.Beta(); - const float k = param.K(); - DLOG << "n=" << n; - DLOG << "alpha=" << alpha; - DLOG << "beta=" << beta; - DLOG << "k=" << k; - DLOG << default_work_size; - DLOG << C; - DLOG << W; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &W); - 
CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &n); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(float), &k); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(float), &alpha); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(float), &beta); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/mul_kernel.cpp b/mobile/src/operators/kernel/cl/mul_kernel.cpp deleted file mode 100644 index 3a45babee0..0000000000 --- a/mobile/src/operators/kernel/cl/mul_kernel.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MUL_OP - -#include "operators/kernel/mul_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool MulKernel::Init(MulParam *param) { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - return true; -} - -template -void MulCompute(const MulParam ¶m, cl_context context, - cl_command_queue commandQueue, cl_kernel kernel0, - cl_kernel kernel1) { - auto input_x = param.InputX(); - Tensor *input_x_tensor = new Tensor(); - input_x_tensor->Resize(input_x->dims()); - input_x_tensor->mutable_data(); - - framework::CLImageToTensor(input_x, input_x_tensor, context, commandQueue, - kernel0); - - auto input_y = param.InputY(); - Tensor input_y_tensor(input_y->data(), input_y->dims()); - - const Tensor x_matrix = - input_x_tensor->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x_tensor, param.XNumColDims()) - : *input_x_tensor; - const Tensor y_matrix = - input_y_tensor.dims().size() > 2 - ? 
framework::ReshapeToMatrix(input_y_tensor, param.YNumColDims()) - : input_y_tensor; - - auto out_dim = param.Out()->dims(); - if (out_dim.size() != 2) { - param.Out()->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto output = param.Out(); - Tensor *output_tensor = new Tensor(); - output_tensor->Resize(output->dims()); - output_tensor->mutable_data(); - math::MatMul(x_matrix, false, y_matrix, false, - static_cast(1), output_tensor, - static_cast(0)); - - // output->InitEmptyImage(context, commandQueue, output_tensor->dims()); - framework::TensorToCLImage(output_tensor, output, context, commandQueue, - kernel1); - - delete (input_x_tensor); - delete (output_tensor); -} - -template <> -void MulKernel::Compute(const MulParam ¶m) { - auto kernel0 = this->cl_helper_.KernelAt(0); - auto kernel1 = this->cl_helper_.KernelAt(1); - - MulCompute(param, this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), kernel0, kernel1); -} - -template class MulKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/multiclass_nms_kernel.cpp b/mobile/src/operators/kernel/cl/multiclass_nms_kernel.cpp deleted file mode 100644 index ce435b8997..0000000000 --- a/mobile/src/operators/kernel/cl/multiclass_nms_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MULTICLASSNMS_OP - -#include "operators/kernel/multiclass_nms_kernel.h" -#include -#include "operators/math/poly_util.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool MultiClassNMSKernel::Init( - MultiClassNMSParam* param) { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - return true; -} -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -template -static inline void GetMaxScoreIndex( - const std::vector& scores, const T threshold, int top_k, - std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); - } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } -} - -template -static inline T BBoxArea(const T* box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. 
- return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const T* box1, const T* box2, - const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size, - const bool normalized) { - T bbox1_area = math::PolyArea(box1, box_size, normalized); - T bbox2_area = math::PolyArea(box2, box_size, normalized); - T inter_area = math::PolyOverlapArea(box1, box2, box_size, normalized); - if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { - // If coordinate values are is invalid - // if area size <= 0, return 0. - return static_cast(0.); - } else { - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline void NMSFast(const framework::Tensor& bbox, - const framework::Tensor& scores, - const T score_threshold, const T nms_threshold, - const T eta, const int64_t top_k, - std::vector* selected_indices) { - // The total boxes for each instance. 
- int64_t num_boxes = bbox.dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox.dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores.data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); - - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - const T* bbox_data = bbox.data(); - - while (sorted_indices.size() != 0) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (size_t k = 0; k < selected_indices->size(); ++k) { - if (keep) { - const int kept_idx = (*selected_indices)[k]; - T overlap = T(0.); - if (box_size == 4) { - overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); - } else { - overlap = PolyIoU(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, box_size, true); - } - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } -} - -template -void MultiClassNMS(const framework::Tensor& scores, - const framework::Tensor& bboxes, - std::map>* indices, int* num_nmsed_out, - const int& background_label, const int& nms_top_k, - const int& keep_top_k, const T& nms_threshold, - const T& nms_eta, const T& score_threshold) { - int64_t class_num = scores.dims()[0]; - int64_t predict_dim = scores.dims()[1]; - int num_det = 0; - for (int64_t c = 0; c < class_num; ++c) { - if (c == background_label) continue; - framework::Tensor score = scores.Slice(c, c + 1); - /// [c] is key - NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, - nms_top_k, &((*indices)[c])); - num_det += (*indices)[c].size(); - } - - *num_nmsed_out = num_det; - const T* scores_data = scores.data(); - if (keep_top_k > -1 && num_det > keep_top_k) { - std::vector>> 
score_index_pairs; - for (const auto& it : *indices) { - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& label_indices = it.second; - for (size_t j = 0; j < label_indices.size(); ++j) { - int idx = label_indices[j]; - // PADDLE_ENFORCE_LT(idx, predict_dim); - score_index_pairs.push_back( - std::make_pair(sdata[idx], std::make_pair(label, idx))); - } - } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); - score_index_pairs.resize(keep_top_k); - - // Store the new indices. - std::map> new_indices; - for (size_t j = 0; j < score_index_pairs.size(); ++j) { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].push_back(idx); - } - new_indices.swap(*indices); - *num_nmsed_out = keep_top_k; - } -} - -template -void MultiClassOutput(const framework::Tensor& scores, - const framework::Tensor& bboxes, - const std::map>& selected_indices, - framework::Tensor* outs) { - int predict_dim = scores.dims()[1]; - int box_size = bboxes.dims()[1]; - int out_dim = bboxes.dims()[1] + 2; - auto* scores_data = scores.data(); - auto* bboxes_data = bboxes.data(); - auto* odata = outs->data(); - - int count = 0; - for (const auto& it : selected_indices) { - /// one batch - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& indices = it.second; - for (size_t j = 0; j < indices.size(); ++j) { - int idx = indices[j]; - const T* bdata = bboxes_data + idx * box_size; - odata[count * out_dim] = label; // label - odata[count * out_dim + 1] = sdata[idx]; // score - // xmin, ymin, xmax, ymax - std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); - count++; - } - } -} - -template -void MultiClassNMSCompute(const MultiClassNMSParam& param, - cl_context context, cl_command_queue commandQueue, - cl_kernel kernel0, cl_kernel kernel1) { - auto* 
input_bboxes_image = param.InputBBoxes(); - auto& input_bboxes_dims = input_bboxes_image->dims(); - Tensor* input_bboxes = new Tensor(); - input_bboxes->Resize(input_bboxes_dims); - input_bboxes->mutable_data(); - DLOG << "yangfei20"; - framework::CLImageToTensor(input_bboxes_image, input_bboxes, context, - commandQueue, kernel0); - DLOG << "yangfei20"; - auto* input_scores_image = param.InputScores(); - auto& input_scores_dims = input_scores_image->dims(); - - Tensor* input_scores = new Tensor(); - input_scores->Resize(input_scores_dims); - input_scores->mutable_data(); - framework::CLImageToTensor(input_scores_image, input_scores, context, - commandQueue, kernel0); - DLOG << "yangfei20"; - auto outs_image = param.Out(); - Tensor* outs = new Tensor(); - outs->Resize(outs_image->dims()); - outs->mutable_data(); - DLOG << *input_bboxes; - DLOG << *input_scores; - DLOG << *outs; - auto background_label = param.BackGroundLabel(); - auto nms_top_k = param.NMSTopK(); - auto keep_top_k = param.KeepTopK(); - auto nms_threshold = param.NMSThreshold(); - auto nms_eta = param.NMSEta(); - auto score_threshold = param.ScoreThreshold(); - - int64_t batch_size = input_scores_dims[0]; - int64_t class_num = input_scores_dims[1]; - int64_t predict_dim = input_scores_dims[2]; - int64_t box_dim = input_bboxes_dims[2]; - - std::vector>> all_indices; - std::vector batch_starts = {0}; - for (int64_t i = 0; i < batch_size; ++i) { - framework::Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - std::map> indices; - int num_nmsed_out = 0; - MultiClassNMS(ins_score, ins_boxes, &indices, &num_nmsed_out, - background_label, nms_top_k, keep_top_k, nms_threshold, - nms_eta, score_threshold); - all_indices.push_back(indices); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - } - - int num_kept = batch_starts.back(); - if 
(num_kept == 0) { - float* od = outs->mutable_data({1}); - od[0] = -1; - } else { - int64_t out_dim = box_dim + 2; - outs->mutable_data({num_kept, out_dim}); - for (int64_t i = 0; i < batch_size; ++i) { - framework::Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - int64_t s = batch_starts[i]; - int64_t e = batch_starts[i + 1]; - if (e > s) { - framework::Tensor out = outs->Slice(s, e); - MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); - } - } - } - DLOG << "yangfei20"; - outs_image->InitEmptyImage(context, commandQueue, outs->dims()); - framework::TensorToCLImage(outs, outs_image, context, commandQueue, kernel1); - DLOG << *outs; - delete (input_bboxes); - delete (input_scores); - delete (outs); - DLOG << "yangfei20"; -} -template <> -void MultiClassNMSKernel::Compute( - const MultiClassNMSParam& param) { - auto kernel0 = this->cl_helper_.KernelAt(0); - auto kernel1 = this->cl_helper_.KernelAt(1); - MultiClassNMSCompute(param, this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), kernel0, - kernel1); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/nearest_interp_kernel.cpp b/mobile/src/operators/kernel/cl/nearest_interp_kernel.cpp deleted file mode 100644 index 285602757b..0000000000 --- a/mobile/src/operators/kernel/cl/nearest_interp_kernel.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef NEAREST_INTERP_OP - -#include - -namespace paddle_mobile { -namespace operators { -template <> -bool NearestInterpolationKernel::Init( - paddle_mobile::operators::NearestInterpolationParam - *param) { - this->cl_helper_.AddKernel("nearest_interp", "nearest_interp_kernel.cl"); - return true; -} - -template <> -void NearestInterpolationKernel::Compute( - const paddle_mobile::operators::NearestInterpolationParam< - paddle_mobile::GPU_CL> ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - auto input = param.InputX(); - cl_mem input_image = input->GetCLImage(); - auto output = param.Out(); - cl_mem output_image = output->GetCLImage(); - float scale_h = output->dims()[2] / input->dims()[2]; - float scale_w = output->dims()[3] / input->dims()[3]; - int in_dims_h = input->dims()[2]; - int out_dims_h = output->dims()[2]; - int in_dims_w = input->dims()[3]; - int out_dims_w = output->dims()[3]; - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 2, sizeof(float), &scale_h); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 3, sizeof(float), &scale_w); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 4, sizeof(int), &in_dims_h); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 5, sizeof(int), &out_dims_h); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 6, 
sizeof(int), &in_dims_w); - CL_CHECK_ERRORS(status) - status = clSetKernelArg(kernel, 7, sizeof(int), &out_dims_w); - CL_CHECK_ERRORS(status) - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status) -} -template class NearestInterpolationKernel; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/pad2d_kernel.cpp b/mobile/src/operators/kernel/cl/pad2d_kernel.cpp deleted file mode 100644 index 3999995b4a..0000000000 --- a/mobile/src/operators/kernel/cl/pad2d_kernel.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PAD2D_OP - -#include "operators/kernel/pad2d_kernel.h" -#include "framework/cl/cl_tensor.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Pad2DKernel::Init(Pad2DParam *param) { - DLOG << "Init pad2d"; - this->cl_helper_.AddKernel("pad2d", "pad2d_kernel.cl"); - return true; -} - -template <> -void Pad2DKernel::Compute(const Pad2DParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - cl_int status; - auto output = param.Out(); - auto input = param.InputX(); - auto output_image = output->GetCLImage(); - auto input_image = input->GetCLImage(); - const int out_H = output->dims()[2]; - const int out_W = output->dims()[3]; - const int input_H = input->dims()[2]; - const int input_W = input->dims()[3]; - const auto &paddings = param.paddings_; - const int pad_top = paddings[0]; - const int pad_bottom = paddings[1]; - const int pad_left = paddings[2]; - const int pad_right = paddings[3]; - const float pad_value = param.pad_value_; - const auto &modeStr = param.mode_; - int mode = 0; - if (modeStr == "reflect") { - mode = 1; - } else if (modeStr == "edge") { - mode = 2; - } - DLOG << "input_H: " << input_H; - status = clSetKernelArg(kernel, 0, sizeof(cl_int), &input_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_int), &input_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_bottom); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_int), &pad_left); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_int), &pad_right); - CL_CHECK_ERRORS(status); - status = 
clSetKernelArg(kernel, 8, sizeof(cl_int), &mode); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(cl_float), &pad_value); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - - CL_CHECK_ERRORS(status); -} - -template class Pad2DKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif // PAD2D_OP diff --git a/mobile/src/operators/kernel/cl/pixel_shuffle_kernel.cpp b/mobile/src/operators/kernel/cl/pixel_shuffle_kernel.cpp deleted file mode 100644 index faa90f9c43..0000000000 --- a/mobile/src/operators/kernel/cl/pixel_shuffle_kernel.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PIXEL_SHUFFLE_OP - -#include "operators/kernel/pixel_shuffle_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool PixelShuffleKernel::Init(PixelShuffleParam *param) { - this->cl_helper_.AddKernel("pixel_shuffle", "pixel_shuffle_kernel.cl"); - return true; -} - -template <> -void PixelShuffleKernel::Compute( - const PixelShuffleParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - - auto input_image = param.InputX()->GetCLImage(); - auto output_image = param.Out()->GetCLImage(); - auto upscale_factor = param.upscale_factor(); - - int input_n = param.InputX()->dims()[0]; - int input_c = param.InputX()->dims()[1]; - int input_h = param.InputX()->dims()[2]; - int input_w = param.InputX()->dims()[3]; - int output_n = param.Out()->dims()[0]; - int output_c = param.Out()->dims()[1]; - int output_h = param.Out()->dims()[2]; - int output_w = param.Out()->dims()[3]; - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &input_n); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &input_h); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &input_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &output_n); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &output_c); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(int), &output_h); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &output_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &upscale_factor); - 
CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/pool_kernel.cpp b/mobile/src/operators/kernel/cl/pool_kernel.cpp deleted file mode 100644 index 990f6ea675..0000000000 --- a/mobile/src/operators/kernel/cl/pool_kernel.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POOL_OP - -#include "operators/kernel/pool_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool PoolKernel::Init(PoolParam *param) { - std::string pooling_type = param->PoolingType(); - this->cl_helper_.AddKernel("pool_" + pooling_type, "pool_kernel.cl"); - return true; -} - -template <> -void PoolKernel::Compute(const PoolParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - - auto input = param.Input()->GetCLImage(); - auto out = param.Output()->GetCLImage(); - - framework::CLImageConverterFolder *input_folder_converter = - reinterpret_cast( - param.Input()->Converter()); - framework::CLImageConverterFolder *output_folder_converter = - reinterpret_cast( - param.Output()->Converter()); - - const int in_height = input_folder_converter->HeightOfOneBlock(); - const int in_width = input_folder_converter->WidthOfOneBlock(); - const int out_height = output_folder_converter->HeightOfOneBlock(); - const int out_width = output_folder_converter->WidthOfOneBlock(); - - std::string pooling_type = param.PoolingType(); - std::vector ksize = param.Ksize(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - - if (param.isGlobalPooling()) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(param.Input()->dims()[i + 2]); - } - } - - const int pad_top = paddings[0]; - const int pad_left = paddings[1]; - const int stride_h = strides[0]; - const int stride_w = strides[1]; - const int ksize_h = ksize[0]; - const int ksize_w = ksize[1]; - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), 
&out_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_int), &stride_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(cl_mem), &out); - CL_CHECK_ERRORS(status); - - // cl_event out_event = param.Output()->GetClEvent(); - // cl_event wait_event = param.Input()->GetClEvent(); - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -template class PoolKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/prior_box_kernel.cpp b/mobile/src/operators/kernel/cl/prior_box_kernel.cpp deleted file mode 100644 index c10bfed8d1..0000000000 --- a/mobile/src/operators/kernel/cl/prior_box_kernel.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PRIORBOX_OP - -#include "operators/kernel/prior_box_kernel.h" -#include "framework/cl/cl_tensor.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool PriorBoxKernel::Init(PriorBoxParam *param) { - this->cl_helper_.AddKernel("prior_box", "prior_box_kernel.cl"); - return true; -} - -template <> -void PriorBoxKernel::Compute( - const PriorBoxParam ¶m) { - const auto *input_ = param.Input(); - const auto &input_dims = input_->dims(); - - const auto &input_image_dims = param.InputImage()->dims(); - - const auto &min_sizes = param.MinSizes(); - const auto &max_sizes = param.MaxSizes(); - const auto &variances = param.Variances(); - const auto &input_aspect_ratio = param.AspectRatios(); - const bool &flip = param.Flip(); - const bool &clip = param.Clip(); - int isclip = 0; - if (clip) { - isclip = 1; - } - const float &step_w = param.StepW(); - const float &step_h = param.StepH(); - const float &offset = param.Offset(); - const int C = param.OutputBoxes()->dims()[1]; - - auto output_boxes = param.OutputBoxes()->GetCLImage(); - auto output_variances = param.OutputVariances()->GetCLImage(); - - std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); - - auto img_width = input_image_dims[3]; - auto img_height = input_image_dims[2]; - - auto feature_width = input_dims[3]; - auto feature_height = input_dims[2]; - - float step_width, step_height; - /// 300 / 19 - if (step_w == 0 || step_h == 0) { - step_width = static_cast(img_width) / feature_width; - step_height = static_cast(img_height) / feature_height; - } else { - step_width = step_w; - step_height = step_h; - } - - int num_priors = aspect_ratios.size() * min_sizes.size(); - if (!max_sizes.empty()) { - num_priors += max_sizes.size(); - } - - float *box_width = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); - float *box_height = 
static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); - float *variancesptr = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * 4)); - int idx = 0; - for (size_t s = 0; s < min_sizes.size(); ++s) { - auto min_size = min_sizes[s]; - if (param.MinMaxAspectRatiosOrder()) { - box_width[idx] = box_height[idx] = min_size / 2.; - idx++; - if (max_sizes.size() > 0) { - auto max_size = max_sizes[s]; - box_width[idx] = box_height[idx] = sqrt(min_size * max_size) / 2.; - idx++; - } - for (float ar : aspect_ratios) { - if (fabs(ar - 1.) < 1e-6) { - continue; - } - box_width[idx] = min_size * sqrt(ar) / 2.; - box_height[idx] = min_size / sqrt(ar) / 2.; - idx++; - } - - } else { - for (float ar : aspect_ratios) { - box_width[idx] = min_size * sqrt(ar) / 2.; - box_height[idx] = min_size / sqrt(ar) / 2.; - idx++; - } - if (!max_sizes.empty()) { - auto max_size = max_sizes[s]; - box_width[idx] = box_height[idx] = sqrt(min_size * max_size) / 2.; - idx++; - } - } - } - for (int i = 0; i < variances.size(); i++) { - variancesptr[i] = variances[i]; - } - cl_int status; - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = - this->cl_helper_.DefaultWorkSize(*param.OutputBoxes()); - auto c_block = default_work_size[0]; - auto w = default_work_size[1]; - auto nh = default_work_size[2]; - - std::vector box_shape({num_priors}); - framework::DDim ddim = framework::make_ddim(box_shape); - - framework::CLTensor box_width_cl_tensor(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - box_width_cl_tensor.Resize(ddim); - cl_mem box_width_Buffer = - box_width_cl_tensor.mutable_with_data(box_width); - - framework::CLTensor box_height_cl_tensor(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - box_height_cl_tensor.Resize(ddim); - cl_mem box_height_Buffer = - box_height_cl_tensor.mutable_with_data(box_height); - - framework::CLTensor variances_cl_tensor(this->cl_helper_.CLContext(), - 
this->cl_helper_.CLCommandQueue()); - - std::vector variances_shape({4}); - framework::DDim vddim = framework::make_ddim(variances_shape); - - variances_cl_tensor.Resize(vddim); - cl_mem variances_Buffer = - variances_cl_tensor.mutable_with_data(variancesptr); - - // DLOG << "c_block:" << c_block; - // DLOG << "w:" << w; - // DLOG << "nh:" << nh; - // DLOG << "step_width:" << step_width; - // DLOG << "step_height:" << step_height; - // DLOG << "offset:" << offset; - // DLOG << "img_width:" << img_width; - // DLOG << "img_height:" << img_height; - // DLOG << "num_priors:" << num_priors; - // DLOG << "C:" << C; - // DLOG << "isclip:" << isclip; - // printf("param.MinMaxAspectRatiosOrder() = - // %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i < - // num_priors; i++) { - // DLOG << box_width[i]; - // DLOG << box_height[i]; - // } - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &box_width_Buffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &variances_Buffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output_boxes); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_variances); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(float), &step_width); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(float), &step_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(float), &offset); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &img_width); - CL_CHECK_ERRORS(status); - status = 
clSetKernelArg(kernel, 12, sizeof(int), &img_height); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 13, sizeof(int), &num_priors); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 14, sizeof(int), &C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 15, sizeof(int), &isclip); - CL_CHECK_ERRORS(status); - size_t global_work_size[2] = {c_block, nh}; - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - - paddle_mobile::memory::Free(box_width); - paddle_mobile::memory::Free(box_height); - paddle_mobile::memory::Free(variancesptr); -} -template class PriorBoxKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/relu6_kernel.cpp b/mobile/src/operators/kernel/cl/relu6_kernel.cpp deleted file mode 100644 index 20a6d9815b..0000000000 --- a/mobile/src/operators/kernel/cl/relu6_kernel.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef RELU_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Relu6Kernel::Init(Relu6Param* param) { - this->cl_helper_.AddKernel("relu6", "relu6.cl"); - return true; -} - -template <> -void Relu6Kernel::Compute(const Relu6Param& param) { - auto kernel = this->cl_helper_.KernelAt(0); - const auto* input = param.InputX(); - auto* output = param.Out(); - float threshold = param.getThreshold(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(float), &threshold); - CL_CHECK_ERRORS(status); - const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, - work_size, NULL, 0, NULL, NULL); -} - -template class Relu6Kernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/relu_kernel.cpp b/mobile/src/operators/kernel/cl/relu_kernel.cpp deleted file mode 100644 index f166963d94..0000000000 --- a/mobile/src/operators/kernel/cl/relu_kernel.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef RELU_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReluKernel::Init(ReluParam* param) { - this->cl_helper_.AddKernel("relu", "relu.cl"); - // this->cl_helper_.AddKernel("relu_p0", "relu.cl"); - // this->cl_helper_.AddKernel("relu_p1", "relu.cl"); - // const auto dim = - // const_cast(param->InputX())->ImageDims(); - // param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), - // this->cl_helper_.CLCommandQueue(), - // dim); - return true; -} - -template <> -void ReluKernel::Compute(const ReluParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - // auto kernel_p0 = this->cl_helper_.KernelAt(1); - // auto kernel_p1 = this->cl_helper_.KernelAt(2); - const auto* input = param.InputX(); - auto* output = param.Out(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - // auto tImage = - // const_cast&>(param).getMidImage().GetCLImage(); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); - // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); - // clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); - // clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); - const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; - - // cl_event out_event = param.Out()->GetClEvent(); - // cl_event wait_event = param.InputX()->GetClEvent(); - - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - // 
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3, - // NULL, - // work_size, NULL, 0, NULL, NULL); -} - -template class ReluKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/reshape2_kernel.cpp b/mobile/src/operators/kernel/cl/reshape2_kernel.cpp deleted file mode 100644 index 7dbea06a51..0000000000 --- a/mobile/src/operators/kernel/cl/reshape2_kernel.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef RESHAPE2_OP - -#include "operators/kernel/reshape2_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Reshape2Kernel::Init(Reshape2Param *param) { - this->cl_helper_.AddKernel("reshape", "reshape.cl"); - return true; -} - -inline framework::DDim ValidateShape(const std::vector shape, - const framework::DDim &in_dims) { - const int64_t in_size = framework::product(in_dims); - // only one dimension can be set to -1, whose size will be automatically - // infered. 
- const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; - - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_MOBILE_ENFORCE( - unk_dim_idx == -1, - "Only one input dimension of Attr(shape) can be unknown."); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_MOBILE_ENFORCE( - static_cast(i) < in_dims.size(), - "The index of dimension to copy from input shape must be less " - "than the size of input shape."); - } else { - PADDLE_MOBILE_ENFORCE( - shape[i] > 0, - "Each input dimension of Attr(shape) must not be negtive except " - "one unknown dimension."); - } - - capacity *= (shape[i] ? shape[i] : in_dims[i]); - output_shape[i] = (shape[i] ? static_cast(shape[i]) : in_dims[i]); - } - - if (unk_dim_idx != -1) { - output_shape[unk_dim_idx] = -in_size / capacity; - PADDLE_MOBILE_ENFORCE(output_shape[unk_dim_idx] * capacity == -in_size, - "Invalid shape is given."); - } else { - PADDLE_MOBILE_ENFORCE(capacity == in_size, "Invalid shape is given."); - } - return framework::make_ddim(output_shape); -} - -template <> -void Reshape2Kernel::Compute( - const Reshape2Param ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - const auto *input = param.InputX(); - auto *output = param.Out(); - auto input_image = input->GetCLImage(); - auto output_image = output->GetCLImage(); - const auto &inputDim = input->dims(); - const auto &outputDim = output->dims(); - int input_dims[4] = {1, 1, 1, 1}; - int output_dims[4] = {1, 1, 1, 1}; - // 1 1000 1 1 - for (int i = 0; i < inputDim.size(); i++) { - input_dims[4 - inputDim.size() + i] = inputDim[i]; - } - - // 1 1 1 1000 - for (int i = 0; i < outputDim.size(); i++) { - output_dims[4 - outputDim.size() + i] = outputDim[i]; - } - - int out_C = output_dims[1]; - int out_H = output_dims[2]; - int out_W = 
output_dims[3]; - int in_W = input_dims[3]; - int in_H = input_dims[2]; - int in_Stride0 = in_W; - int in_Stride1 = input_dims[2] * input_dims[3]; - int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3]; - int out_Stride0 = out_W; - int out_Stride1 = out_H * out_W; - int out_Stride2 = out_C * out_H * out_W; - DLOG << "out_C=" << out_C; - DLOG << "out_H=" << out_H; - DLOG << "out_W=" << out_W; - DLOG << "in_W=" << in_W; - DLOG << "default_work_size=" << default_work_size; - DLOG << "in_Stride0=" << in_Stride0; - DLOG << "in_Stride1=" << in_Stride1; - DLOG << "out_Stride0=" << out_Stride0; - DLOG << "out_Stride1=" << out_Stride1; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &in_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &in_Stride2); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - 
CL_CHECK_ERRORS(status); -} - -template class Reshape2Kernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/reshape_kernel.cpp b/mobile/src/operators/kernel/cl/reshape_kernel.cpp deleted file mode 100644 index 18d98b0ff9..0000000000 --- a/mobile/src/operators/kernel/cl/reshape_kernel.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef RESHAPE_OP - -#include "operators/kernel/reshape_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReshapeKernel::Init(ReshapeParam *param) { - this->cl_helper_.AddKernel("reshape", "reshape.cl"); - return true; -} - -template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - const auto *input = param.InputX(); - auto *output = param.Out(); - auto input_image = input->GetCLImage(); - auto output_image = output->GetCLImage(); - const auto &inputDim = input->dims(); - const auto &outputDim = output->dims(); - int input_dims[4] = {1, 1, 1, 1}; - int output_dims[4] = {1, 1, 1, 1}; - // 1 1000 1 1 - for (int i = 0; i < inputDim.size(); i++) { - input_dims[4 - inputDim.size() + i] = inputDim[i]; - } - - // 1 1 1 1000 - for (int i = 0; i < outputDim.size(); i++) { - output_dims[4 - outputDim.size() + i] = outputDim[i]; - } - - int 
out_C = output_dims[1]; - int out_H = output_dims[2]; - int out_W = output_dims[3]; - int in_W = input_dims[3]; - int in_H = input_dims[2]; - int in_Stride0 = in_W; - int in_Stride1 = input_dims[2] * input_dims[3]; - int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3]; - int out_Stride0 = out_W; - int out_Stride1 = out_H * out_W; - int out_Stride2 = out_C * out_H * out_W; - DLOG << "out_C=" << out_C; - DLOG << "out_H=" << out_H; - DLOG << "out_W=" << out_W; - DLOG << "in_W=" << in_W; - DLOG << "default_work_size=" << default_work_size; - DLOG << "in_Stride0=" << in_Stride0; - DLOG << "in_Stride1=" << in_Stride1; - DLOG << "out_Stride0=" << out_Stride0; - DLOG << "out_Stride1=" << out_Stride1; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(int), &in_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &in_Stride2); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, 
default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -template class ReshapeKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/scale_kernel.cpp b/mobile/src/operators/kernel/cl/scale_kernel.cpp deleted file mode 100644 index 4ab2be7c3f..0000000000 --- a/mobile/src/operators/kernel/cl/scale_kernel.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SCALE_OP - -#include "operators/kernel/scale_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ScaleKernel::Init(ScaleParam* param) { - this->cl_helper_.AddKernel("scale", "scale_kernel.cl"); - return true; -} - -template <> -void ScaleKernel::Compute(const ScaleParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - const auto* input = param.InputX(); - auto* output = param.Out(); - const float scale = param.Scale(); - const float bias = param.Bias(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - int out_width = (output->dims().size() == 4) ? 
output->dims()[3] : 1; - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(float), &scale); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(float), &bias); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_width); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -template class ScaleKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/sigmoid_kernel.cpp b/mobile/src/operators/kernel/cl/sigmoid_kernel.cpp deleted file mode 100644 index 33ce051f4a..0000000000 --- a/mobile/src/operators/kernel/cl/sigmoid_kernel.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef SIGMOID_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SigmoidKernel::Init(SigmoidParam* param) { - this->cl_helper_.AddKernel("sigmoid", "sigmoid.cl"); - return true; -} - -template <> -void SigmoidKernel::Compute(const SigmoidParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - const auto* input = param.InputX(); - auto* output = param.Out(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -template class SigmoidKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/slice_kernel.cpp b/mobile/src/operators/kernel/cl/slice_kernel.cpp deleted file mode 100644 index 446d003219..0000000000 --- a/mobile/src/operators/kernel/cl/slice_kernel.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SLICE_OP - -#include -#include - -namespace paddle_mobile { -namespace operators { -template <> -bool SliceKernel::Init( - paddle_mobile::operators::SliceParam *param) { - this->cl_helper_.AddKernel("slice", "slice_kernel.cl"); - return true; -} - -template <> -void SliceKernel::Compute( - const paddle_mobile::operators::SliceParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.output_); - auto input = param.input_; - cl_mem input_image = input->GetCLImage(); - auto output = param.output_; - cl_mem output_image = output->GetCLImage(); - int starts_0 = param.starts_[0]; - int ends_0 = param.ends_[0]; - int axes_0 = param.axes_[0] - (param.original_output_dims_size_ - - param.output_->dims().size()); - int dims_w = input->dims()[axes_0 + 2]; - - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &starts_0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &ends_0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &dims_w); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} -template class SliceKernel; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/softmax_kernel.cpp b/mobile/src/operators/kernel/cl/softmax_kernel.cpp deleted file mode 100644 index 6447b68d33..0000000000 --- a/mobile/src/operators/kernel/cl/softmax_kernel.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SOFTMAX_OP - -#include "operators/kernel/softmax_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SoftmaxKernel::Init(SoftmaxParam *param) { - this->cl_helper_.AddKernel("softmax", "softmax.cl"); - return true; -} - -template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - const auto *input = param.InputX(); - auto *output = param.Out(); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - const auto &outputDim = output->dims(); - - int dims[4] = {1, 1, 1, 1}; - - for (int i = 0; i < outputDim.size(); i++) { - dims[4 - outputDim.size() + i] = outputDim[i]; - } - - const int out_W = dims[3]; - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - - CL_CHECK_ERRORS(status); -} - -template class SoftmaxKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/split_kernel.cpp 
b/mobile/src/operators/kernel/cl/split_kernel.cpp deleted file mode 100644 index 58c7361bc5..0000000000 --- a/mobile/src/operators/kernel/cl/split_kernel.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SPLIT_OP - -#include "operators/kernel/split_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SplitKernel::Init(SplitParam* param) { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - return true; -} - -// Strided numel memory copy from src to dst by the specified axis -// -// For example, for a tensor dims [4, 20, 100], the strieded numel is -// [8000, 2000, 100] -// -// NOTE: The src and dst tensor should have the same elements -// except the specified axis. 
-template -void StridedNumelCopyWithAxis(int64_t axis, T* dst, - const framework::DDim& dst_stride_numel, - const T* src, - const framework::DDim& src_stride_numel, - int64_t size) { - int64_t before = dst_stride_numel[0] / dst_stride_numel[axis]; - int64_t src_after = src_stride_numel[axis]; - int64_t dst_after = dst_stride_numel[axis]; - - PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(), - "src and dst tensor should have the same dims size."); - - for (int64_t i = 0; i < axis; ++i) { - if (i < axis) { - PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] == - dst_stride_numel[i] / dst_stride_numel[axis], - "src and dst should have the same elements " - "except the specified axis."); - } else if (i == axis) { - continue; - } else { - PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i], - "src and dst should have the same elements " - "except the specified axis."); - } - } - - for (int64_t i = 0; i < before; ++i) { - memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size); - } -} - -template <> -void SplitKernel::Compute(const SplitParam& param) { - auto kernel0 = this->cl_helper_.KernelAt(0); - auto kernel1 = this->cl_helper_.KernelAt(1); - auto* input_image = param.InputX(); - auto in_stride = framework::stride_numel(input_image->dims()); - auto input_dims = input_image->dims(); - auto outs_images = param.Outs(); - int64_t axis = param.Axis(); - - Tensor* input_tensor = new Tensor(); - input_tensor->Resize(input_image->dims()); - input_tensor->mutable_data(); - - framework::CLImageToTensor(input_image, input_tensor, - this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), kernel0); - - size_t input_offset = 0; - for (auto out : outs_images) { - auto out_stride = framework::stride_numel(out->dims()); - - Tensor* temp_out = new Tensor(); - temp_out->Resize(out->dims()); - temp_out->mutable_data(); - framework::CLImageToTensor(out, temp_out, this->cl_helper_.CLContext(), - 
this->cl_helper_.CLCommandQueue(), kernel0); - StridedNumelCopyWithAxis(axis, temp_out->data(), out_stride, - input_tensor->data() + input_offset, - in_stride, out_stride[axis]); - input_offset += out_stride[axis]; - out->InitEmptyImage(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), temp_out->dims()); - framework::TensorToCLImage(temp_out, out, this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), kernel1); - outs_images.push_back(out); - - delete (temp_out); - } - delete (input_tensor); -} - -template class SplitKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/tanh_kernel.cpp b/mobile/src/operators/kernel/cl/tanh_kernel.cpp deleted file mode 100644 index 5c63a3606d..0000000000 --- a/mobile/src/operators/kernel/cl/tanh_kernel.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef TANH_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool TanhKernel::Init(TanhParam* param) { - this->cl_helper_.AddKernel("tanh_kernel", "tanh_kernel.cl"); - return true; -} - -template <> -void TanhKernel::Compute(const TanhParam& param) { - auto kernel = this->cl_helper_.KernelAt(0); - const auto* input = param.InputX(); - auto* output = param.Out(); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - CL_CHECK_ERRORS(status); - const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; - - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); -} - -template class TanhKernel; - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/cl/transpose2_kernel.cpp b/mobile/src/operators/kernel/cl/transpose2_kernel.cpp deleted file mode 100644 index 248eb3d12e..0000000000 --- a/mobile/src/operators/kernel/cl/transpose2_kernel.cpp +++ /dev/null @@ -1,219 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef TRANSPOSE2_OP - -#include "operators/kernel/transpose2_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Transpose2Kernel::Init(Transpose2Param *param) { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - return true; -} - -inline bool IsShuffleChannel(const std::vector &axis) { - bool is_shuffle_channel = true; - if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { - for (int i = 3; i < axis.size(); ++i) { - if (axis[i] != i) { - is_shuffle_channel = false; - break; - } - } - } else { - return false; - } - return is_shuffle_channel; -} - -template -void ShuffleChannelCompute(const Transpose2Param ¶m, - cl_context context, cl_command_queue commandQueue, - cl_kernel kernel0, cl_kernel kernel1) { - auto axis = param.Axis(); - int axis_size = axis.size(); - - bool shouldResize = true; - int diff_dim = 0; - if (axis_size > 4) { - for (int i = 0; i < axis_size - 4; ++i) { - if (axis[i] != i) { - shouldResize = false; - break; - } else { - diff_dim++; - } - } - if (shouldResize) { - std::vector temp_axis_dims; - temp_axis_dims.reserve(static_cast(4)); - for (int i = axis_size - 4; i < axis_size; ++i) { - temp_axis_dims.push_back(axis[i] - diff_dim); - } - axis.resize(4); - axis.clear(); - axis.insert(axis.begin(), temp_axis_dims.begin(), temp_axis_dims.end()); - } - } - - auto input = param.InputX(); - Tensor *input_tensor = new Tensor(); - input_tensor->Resize(input->dims()); - input_tensor->mutable_data(); - - framework::CLImageToTensor(input, input_tensor, context, commandQueue, - kernel0); - const Dtype *input_ptr = input_tensor->data(); - - auto output = param.Out(); - Tensor *output_tensor = new Tensor(); - framework::DDim out_dims(input->dims()); - for (size_t i = 0; i < axis_size; i++) { - out_dims[i] = input->dims()[axis[i]]; - } - output_tensor->Resize(out_dims); - output_tensor->mutable_data(); - Dtype *output_ptr = 
output_tensor->mutable_data(); - // input and output's shape dimension must >= 2 && <= 6. - const framework::DDim &in_dim = input->dims(); - const framework::DDim &out_dim = output->dims(); - size_t offset = 1; - for (int i = 3; i < axis.size(); ++i) { - offset *= in_dim[i]; - } - -#pragma omp parallel for collapse(3) - for (int batch = 0; batch < out_dim[0]; ++batch) { - for (int c1 = 0; c1 < out_dim[1]; ++c1) { - for (int c2 = 0; c2 < out_dim[2]; ++c2) { - size_t out_offset = - ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; - size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; - memcpy(output_ptr + out_offset, input_ptr + in_offset, - offset * sizeof(Dtype)); - } - } - } - - output->InitEmptyImage(context, commandQueue, output_tensor->dims()); - framework::TensorToCLImage(output_tensor, output, context, commandQueue, - kernel1); - - delete (input_tensor); - delete (output_tensor); -} - -template -void Transpose2Compute(const Transpose2Param ¶m, cl_context context, - cl_command_queue commandQueue, cl_kernel kernel0, - cl_kernel kernel1) { - const std::vector &axis = param.Axis(); - - auto input = param.InputX(); - Tensor *input_tensor = new Tensor(); - input_tensor->Resize(input->dims()); - input_tensor->mutable_data(); - framework::CLImageToTensor(input, input_tensor, context, commandQueue, - kernel0); - const Dtype *input_ptr = input_tensor->data(); - - auto output = param.Out(); - Tensor *output_tensor = new Tensor(); - output_tensor->Resize(input->dims()); - output_tensor->mutable_data(); - Dtype *output_ptr = output_tensor->mutable_data(); - // input and output's shape dimension must >= 2 && <= 6. - const framework::DDim &in_dim = input->dims(); - const framework::DDim &out_dim = output->dims(); - - // precompute inverted output dim and strides - size_t rout_dim[6], strides[6]; - int permute = axis.size(); // permute must >=2 && <= 6. 
- for (int i = 0; i < permute; ++i) { - int k = permute - 1 - i; - strides[k] = 1; - for (int j = axis[i] + 1; j < permute; ++j) { - strides[k] *= in_dim[j]; - } - rout_dim[k] = out_dim[i]; - } - // unroll the first 2 dimensions - int reamin_dim = 1; - for (int i = 2; i < out_dim.size(); ++i) { - reamin_dim *= out_dim[i]; - } - -#pragma omp parallel for collapse(2) - for (int batch = 0; batch < out_dim[0]; ++batch) { - for (int j = 0; j < out_dim[1]; ++j) { - size_t offset = batch * strides[permute - 1] + j * strides[permute - 2]; - Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim; - int indics[4] = {0, 0, 0, 0}; - for (int k = 0; k < reamin_dim; ++k) { - out_ptr[k] = input_ptr[offset]; - indics[0] += 1; - offset += strides[0]; - for (int p = 0; p < permute - 3; ++p) { - if (indics[p] == rout_dim[p]) { - indics[p + 1] += 1; - indics[p] = 0; - offset += strides[p + 1]; - offset -= rout_dim[p] * strides[p]; - } else { - break; - } - } - } - } - } - - // output->InitEmptyImage(context, commandQueue, output_tensor->dims()); - framework::TensorToCLImage(output_tensor, output, context, commandQueue, - kernel1); - delete (input_tensor); - delete (output_tensor); -} - -template <> -void Transpose2Kernel::Compute( - const Transpose2Param ¶m) { - auto kernel0 = this->cl_helper_.KernelAt(0); - auto kernel1 = this->cl_helper_.KernelAt(1); - - const std::vector &axis = param.Axis(); - bool shuffle_channel = IsShuffleChannel(axis); - if (shuffle_channel) { - DLOG << "transpose shuffle_channel .. "; - ShuffleChannelCompute(param, this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), kernel0, - kernel1); - } else { - DLOG << "transpose 2 compute .. "; - Transpose2Compute(param, this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), kernel0, - kernel1); - } - - DLOG << "transpose end .. 
"; -} - -template class Transpose2Kernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/cl/transpose_kernel.cpp b/mobile/src/operators/kernel/cl/transpose_kernel.cpp deleted file mode 100644 index d3133449b9..0000000000 --- a/mobile/src/operators/kernel/cl/transpose_kernel.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef TRANSPOSE_OP - -#include "operators/kernel/transpose_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool TransposeKernel::Init(TransposeParam *param) { - if (param->Out()->dims().size() == 4) { - this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl"); - } else if (param->Out()->dims().size() < 4) { - this->cl_helper_.AddKernel("transpose", "transpose_kernel.cl"); - } - return true; -} - -template <> -void TransposeKernel::Compute( - const TransposeParam ¶m) { - if (param.Out()->dims().size() == 4) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - int out_C = param.Out()->dims()[1]; - int out_H = param.Out()->dims()[2]; - int out_W = param.Out()->dims()[3]; - int in_W = param.InputX()->dims()[3]; - auto output_image = param.Out()->GetCLImage(); - auto input_image = param.InputX()->GetCLImage(); - DLOG << "out_C=" << out_C; - DLOG << "out_H=" << out_H; - DLOG << "out_W=" << out_W; - DLOG 
<< "in_C=" << in_W; - DLOG << "default_work_size=" << default_work_size; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else if (param.Out()->dims().size() == 3) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - int out_C = param.Out()->dims()[0]; - int out_H = param.Out()->dims()[1]; - int out_W = param.Out()->dims()[2]; - int in_W = param.InputX()->dims()[2]; - auto output_image = param.Out()->GetCLImage(); - auto input_image = param.InputX()->GetCLImage(); - DLOG << "out_C=" << out_C; - DLOG << "out_H=" << out_H; - DLOG << "out_W=" << out_W; - DLOG << "in_C=" << in_W; - DLOG << "default_work_size=" << default_work_size; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - 
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - - } else if (param.Out()->dims().size() == 2) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); - int out_C = 1; - int out_H = param.Out()->dims()[0]; - int out_W = param.Out()->dims()[1]; - int in_W = param.InputX()->dims()[1]; - auto output_image = param.Out()->GetCLImage(); - auto input_image = param.InputX()->GetCLImage(); - DLOG << "out_C=" << out_C; - DLOG << "out_H=" << out_H; - DLOG << "out_W=" << out_W; - DLOG << "in_C=" << in_W; - DLOG << "default_work_size=" << default_work_size; - cl_int status; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/compare_kernel.h b/mobile/src/operators/kernel/compare_kernel.h deleted file mode 100644 index 8932ca7757..0000000000 --- a/mobile/src/operators/kernel/compare_kernel.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef LESS_THAN_OP -DECLARE_KERNEL(LessThan, CompareParam); -#endif // LESS_THAN_OP - -#ifdef EQUAL_OP -DECLARE_KERNEL(Equal, CompareParam); -#endif // EQUAL_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/concat_kernel.h b/mobile/src/operators/kernel/concat_kernel.h deleted file mode 100644 index ac9ebca4d5..0000000000 --- a/mobile/src/operators/kernel/concat_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONCAT_OP - -#pragma once -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using namespace framework; - -template -class ConcatKernel - : public framework::OpKernelBase> { - public: - void Compute(const ConcatParam ¶m); - bool Init(ConcatParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conditional_block_kernel.h b/mobile/src/operators/kernel/conditional_block_kernel.h deleted file mode 100644 index 851d558c2c..0000000000 --- a/mobile/src/operators/kernel/conditional_block_kernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONDITIONAL_BLOCK_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class ConditionalBlockParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ConditionalBlockParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = OpParam::GetMultiVarValue("Input", inputs, *scope); - cond_ = OpParam::GetMultiVarValue("Cond", inputs, *scope); - output_ = OpParam::OutFrom(outputs, *scope); - scope_ = OpParam::GetVar("Scope", outputs, *scope); - is_scalar_condition_ = GetAttr("is_scalar_condition", attrs); - sub_block_ = GetAttr("sub_block", attrs); - } - - const vector Input() const { return input_; } - - const vector Cond() const { return cond_; } - - GType *Output() const { return output_; } - - Variable *OutputScope() const { return scope_; } - - bool isScalarCondition() const { return is_scalar_condition_; } - - framework::BlockDesc *getSubBlock() const { return sub_block_; } - - private: - vector input_; - vector cond_; - GType *output_; - Variable *scope_; - bool is_scalar_condition_; - framework::BlockDesc *sub_block_; -}; - -DECLARE_KERNEL(ConditionalBlock, ConditionalBlockParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // CONDITIONAL_BLOCK_OP diff --git a/mobile/src/operators/kernel/conv_add_bn_kernel.h b/mobile/src/operators/kernel/conv_add_bn_kernel.h deleted file mode 100644 index 757664eb53..0000000000 --- a/mobile/src/operators/kernel/conv_add_bn_kernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef FUSION_CONVADDBN_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvAddBNKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvAddBNParam ¶m); - bool Init(FusionConvAddBNParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_add_bn_relu_kernel.h b/mobile/src/operators/kernel/conv_add_bn_relu_kernel.h deleted file mode 100644 index 2174a6f125..0000000000 --- a/mobile/src/operators/kernel/conv_add_bn_relu_kernel.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef FUSION_CONVADDBNRELU_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvAddBNReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvAddBNReluParam ¶m); - bool Init(FusionConvAddBNReluParam *param); - - private: - bool could_use_faster_depthwise_conv_ = false; - bool use_gemm_add_bn_relu = false; - bool use_slidingwindow_add_bn_relu = false; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_add_kernel.h b/mobile/src/operators/kernel/conv_add_kernel.h deleted file mode 100644 index fd3f279a78..0000000000 --- a/mobile/src/operators/kernel/conv_add_kernel.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP - -#pragma once - -#include -#ifdef __ARM_NEON -#include -#endif -#include "common/common.h" -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvAddKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvAddParam ¶m); - bool Init(FusionConvAddParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_add_relu_kernel.h b/mobile/src/operators/kernel/conv_add_relu_kernel.h deleted file mode 100644 index 8cfc92ef19..0000000000 --- a/mobile/src/operators/kernel/conv_add_relu_kernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef FUSION_CONVADDRELU_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvAddReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvAddReluParam ¶m); - bool Init(FusionConvAddReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_bn_add_relu_kernel.h b/mobile/src/operators/kernel/conv_bn_add_relu_kernel.h deleted file mode 100644 index 63a86b5653..0000000000 --- a/mobile/src/operators/kernel/conv_bn_add_relu_kernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef FUSION_CONVBNADDRELU_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvBNAddReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvBNAddReluParam ¶m); - bool Init(FusionConvBNAddReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_bn_kernel.h b/mobile/src/operators/kernel/conv_bn_kernel.h deleted file mode 100644 index 1fb0d680cf..0000000000 --- a/mobile/src/operators/kernel/conv_bn_kernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef FUSION_CONVBN_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvBNKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvBNParam ¶m); - bool Init(FusionConvBNParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_bn_relu_kernel.h b/mobile/src/operators/kernel/conv_bn_relu_kernel.h deleted file mode 100644 index aef735a524..0000000000 --- a/mobile/src/operators/kernel/conv_bn_relu_kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef FUSION_CONVBNRELU_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvBNReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvBNReluParam ¶m); - bool Init(FusionConvBNReluParam *param); - - private: - bool use_gemm_bn_relu = false; - bool use_slidingwindow_bn_relu = false; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_kernel.h b/mobile/src/operators/kernel/conv_kernel.h deleted file mode 100644 index cac498c36b..0000000000 --- a/mobile/src/operators/kernel/conv_kernel.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class ConvKernel : public OpKernelBase> { - public: - void Compute(const ConvParam ¶m); - bool Init(ConvParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_relu_kernel.h b/mobile/src/operators/kernel/conv_relu_kernel.h deleted file mode 100644 index 4fb2fe3171..0000000000 --- a/mobile/src/operators/kernel/conv_relu_kernel.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef FUSION_CONVRELU_OP - -#include -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class ConvReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvReluParam ¶m); - bool Init(FusionConvReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/conv_transpose_kernel.h b/mobile/src/operators/kernel/conv_transpose_kernel.h deleted file mode 100644 index 6341a87d43..0000000000 --- a/mobile/src/operators/kernel/conv_transpose_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_TRANSPOSE_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class ConvTransposeKernel - : public OpKernelBase> { - public: - void Compute(const ConvTransposeParam ¶m); - - bool Init(ConvTransposeParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // PADDLE_MOBILE_DE_CONV_KERNEL_H diff --git a/mobile/src/operators/kernel/crf_kernel.h b/mobile/src/operators/kernel/crf_kernel.h deleted file mode 100644 index 1436aafc06..0000000000 --- a/mobile/src/operators/kernel/crf_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CRF_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class CrfKernel - : public framework::OpKernelBase> { - public: - void Compute(const CrfParam& param); - bool Init(CrfParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/deconv_add_bn_kernel.h b/mobile/src/operators/kernel/deconv_add_bn_kernel.h deleted file mode 100755 index 181367031c..0000000000 --- a/mobile/src/operators/kernel/deconv_add_bn_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_DECONVADDBN_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class DeconvAddBNKernel - : public OpKernelBase> { - public: - void Compute(const FusionDeconvAddBNParam ¶m); - - bool Init(FusionDeconvAddBNParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/deconv_add_bn_relu_kernel.h b/mobile/src/operators/kernel/deconv_add_bn_relu_kernel.h deleted file mode 100755 index c63b4db050..0000000000 --- a/mobile/src/operators/kernel/deconv_add_bn_relu_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDBNRELU_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class DeconvAddBNReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionDeconvAddBNReluParam ¶m); - - bool Init(FusionDeconvAddBNReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/deconv_add_kernel.h b/mobile/src/operators/kernel/deconv_add_kernel.h deleted file mode 100644 index 61170f95e2..0000000000 --- a/mobile/src/operators/kernel/deconv_add_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADD_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class DeconvAddKernel - : public OpKernelBase> { - public: - void Compute(const FusionDeconvAddParam ¶m); - - bool Init(FusionDeconvAddParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/deconv_add_relu_kernel.h b/mobile/src/operators/kernel/deconv_add_relu_kernel.h deleted file mode 100644 index dc48272157..0000000000 --- a/mobile/src/operators/kernel/deconv_add_relu_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDRELU_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class DeconvAddReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionDeconvAddReluParam ¶m); - - bool Init(FusionDeconvAddReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/deconv_bn_relu_kernel.h b/mobile/src/operators/kernel/deconv_bn_relu_kernel.h deleted file mode 100755 index 4ab0257b07..0000000000 --- a/mobile/src/operators/kernel/deconv_bn_relu_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVBNRELU_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class DeconvBNReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionDeconvBNReluParam ¶m); - - bool Init(FusionDeconvBNReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/deconv_relu_kernel.h b/mobile/src/operators/kernel/deconv_relu_kernel.h deleted file mode 100644 index bc85f1ffee..0000000000 --- a/mobile/src/operators/kernel/deconv_relu_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVRELU_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class DeconvReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionDeconvReluParam ¶m); - - bool Init(FusionDeconvReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/dequant_bn_kernel.h b/mobile/src/operators/kernel/dequant_bn_kernel.h deleted file mode 100644 index cf759bf69c..0000000000 --- a/mobile/src/operators/kernel/dequant_bn_kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef FUSION_DEQUANT_BN_OP -DECLARE_KERNEL(FusionDequantBN, FusionDequantBNParam); -#endif - -#ifdef FUSION_DEQUANT_BN_RELU_OP -DECLARE_KERNEL(FusionDequantBNRelu, FusionDequantBNParam); -#endif - -#ifdef FUSION_DEQUANT_ADD_BN_OP -DECLARE_KERNEL(FusionDequantAddBN, FusionDequantAddBNParam); -#endif - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP -DECLARE_KERNEL(FusionDequantAddBNRelu, FusionDequantAddBNParam); -#endif - -#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP -DECLARE_KERNEL(FusionDequantAddBNQuant, FusionDequantAddBNQuantParam); -#endif - -#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP -DECLARE_KERNEL(FusionDequantAddBNReluQuant, FusionDequantAddBNQuantParam); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/dequantize_kernel.h b/mobile/src/operators/kernel/dequantize_kernel.h deleted file mode 100644 index 6ba8ec88c5..0000000000 --- a/mobile/src/operators/kernel/dequantize_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DEQUANT_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class DequantizeKernel - : public framework::OpKernelBase> { - public: - void Compute(const DequantizeParam ¶m); - bool Init(DequantizeParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/detection_kernel.h b/mobile/src/operators/kernel/detection_kernel.h deleted file mode 100644 index 89c8348d5b..0000000000 --- a/mobile/src/operators/kernel/detection_kernel.h +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef ANCHOR_GENERATOR_OP -template -class AnchorGeneratorParam : public OpParam { - public: - AnchorGeneratorParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = - OpParam::GetVarValue("Input", inputs, *scope); - output_anchors_ = - OpParam::GetVarValue("Anchors", outputs, *scope); - output_variances_ = OpParam::GetVarValue( - "Variances", outputs, *scope); - - anchor_sizes_ = OpParam::GetAttr>("anchor_sizes", attrs); - aspect_ratios_ = - OpParam::GetAttr>("aspect_ratios", attrs); - variances_ = OpParam::GetAttr>("variances", attrs); - stride_ = OpParam::GetAttr>("stride", attrs); - offset_ = OpParam::GetAttr("offset", attrs); - } - - public: - // input - framework::Tensor *input_; - // outputs - framework::Tensor *output_anchors_; - framework::Tensor *output_variances_; - - std::vector anchor_sizes_; - std::vector aspect_ratios_; - std::vector variances_; - std::vector stride_; - float offset_; -}; - -DECLARE_KERNEL(AnchorGenerator, AnchorGeneratorParam); -#endif - -#ifdef PROPOSAL_OP -template -class ProposalParam : public OpParam { - public: - ProposalParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - scores_ = - OpParam::GetVarValue("Scores", inputs, *scope); - bbox_deltas_ = OpParam::GetVarValue("BboxDeltas", - inputs, *scope); - im_info_ = - OpParam::GetVarValue("ImInfo", inputs, *scope); - anchors_ = - OpParam::GetVarValue("Anchors", inputs, *scope); - variances_ = - OpParam::GetVarValue("Variances", inputs, *scope); - - rpn_rois_ = - OpParam::GetVarValue("RpnRois", outputs, *scope); - rpn_probs_ = OpParam::GetVarValue("RpnRoiProbs", - outputs, *scope); - - pre_nms_topn_ = 
OpParam::GetAttr("pre_nms_topN", attrs); - post_nms_topn_ = OpParam::GetAttr("post_nms_topN", attrs); - nms_thresh_ = OpParam::GetAttr("nms_thresh", attrs); - min_size_ = OpParam::GetAttr("min_size", attrs); - eta_ = OpParam::GetAttr("eta", attrs); - } - - public: - framework::Tensor *scores_; - framework::Tensor *bbox_deltas_; - framework::Tensor *im_info_; - framework::Tensor *anchors_; - framework::Tensor *variances_; - - std::shared_ptr score_index_; - - framework::LoDTensor *rpn_rois_; - framework::LoDTensor *rpn_probs_; - - int pre_nms_topn_; - int post_nms_topn_; - float nms_thresh_; - float min_size_; - float eta_; -#ifdef PADDLE_MOBILE_FPGA - std::shared_ptr float_score, float_bbox; - fpga::BypassArgs score_arg, bbox_arg; -#endif -}; - -DECLARE_KERNEL(Proposal, ProposalParam); -#endif - -#ifdef PSROI_POOL_OP -template -class PSRoiPoolParam : public OpParam { - public: - PSRoiPoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = OpParam::GetVarValue("X", inputs, *scope); - input_rois_ = - OpParam::GetVarValue("ROIs", inputs, *scope); - output_ = - OpParam::GetVarValue("Out", outputs, *scope); - - output_channels_ = OpParam::GetAttr("output_channels", attrs); - pooled_height_ = OpParam::GetAttr("pooled_height", attrs); - pooled_width_ = OpParam::GetAttr("pooled_width", attrs); - spatial_scale_ = OpParam::GetAttr("spatial_scale", attrs); - } - - public: - framework::Tensor *input_x_; - framework::LoDTensor *input_rois_; - framework::Tensor *output_; - int output_channels_; - int pooled_height_; - int pooled_width_; - float spatial_scale_; -#ifdef PADDLE_MOBILE_FPGA - std::shared_ptr float_input, float_output; - fpga::BypassArgs input_arg, output_arg; -#endif -}; - -DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam); -#endif - -#ifdef ROIALIGN_POOL_OP -template -class RoiAlignPoolParam : public OpParam { - public: - RoiAlignPoolParam(const 
VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = OpParam::GetVarValue("X", inputs, *scope); - input_rois_ = - OpParam::GetVarValue("ROIs", inputs, *scope); - output_ = - OpParam::GetVarValue("Out", outputs, *scope); - - pooled_height_ = OpParam::GetAttr("pooled_height", attrs); - pooled_width_ = OpParam::GetAttr("pooled_width", attrs); - spatial_scale_ = OpParam::GetAttr("spatial_scale", attrs); - sampling_ratio_ = OpParam::GetAttr("sampling_ratio", attrs); - } - - public: - framework::Tensor *input_x_; - framework::LoDTensor *input_rois_; - framework::Tensor *output_; - int pooled_height_; - int pooled_width_; - float spatial_scale_; - int sampling_ratio_; -#ifdef PADDLE_MOBILE_FPGA - std::shared_ptr float_input, float_output; - fpga::BypassArgs input_arg, output_arg; -#endif -}; - -DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam); -#endif - -#ifdef ROI_PERSPECTIVE_OP -template -class RoiPerspectiveParam : public OpParam { - public: - RoiPerspectiveParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = OpParam::GetVarValue("X", inputs, *scope); - input_rois_ = - OpParam::GetVarValue("ROIs", inputs, *scope); - output_ = - OpParam::GetVarValue("Out", outputs, *scope); - transform_Matrix_ = OpParam::GetVarValue( - "TransformMatrix", outputs, *scope); - mask = OpParam::GetVarValue("Mask", outputs, *scope); - - spatial_scale_ = OpParam::GetAttr("spatial_scale", attrs); - transformed_height_ = OpParam::GetAttr("transformed_height", attrs); - transformed_width_ = OpParam::GetAttr("transformed_width", attrs); - } - - public: - framework::Tensor *input_x_; - framework::LoDTensor *input_rois_; - framework::Tensor *output_; - framework::Tensor *transform_Matrix_; - framework::Tensor *mask; - - float spatial_scale_; - int transformed_height_; - int 
transformed_width_; -}; - -DECLARE_KERNEL(RoiPerspective, RoiPerspectiveParam); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/dropout_kernel.h b/mobile/src/operators/kernel/dropout_kernel.h deleted file mode 100644 index 2f59d01b67..0000000000 --- a/mobile/src/operators/kernel/dropout_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DROPOUT_OP - -#include "framework/operator.h" -#include "operators/op_param.h" - -#pragma once - -namespace paddle_mobile { -namespace operators { - -template -class DropoutKernel - : public framework::OpKernelBase> { - public: - void Compute(const DropoutParam& param); - bool Init(DropoutParam* para); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/dwconv_bn_relu_kernel.h b/mobile/src/operators/kernel/dwconv_bn_relu_kernel.h deleted file mode 100644 index 3bd8093adb..0000000000 --- a/mobile/src/operators/kernel/dwconv_bn_relu_kernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef FUSION_DWCONVBNRELU_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class DWConvBNReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionDWConvBNReluParam ¶m); - bool Init(FusionDWConvBNReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/elementwise_add_kernel.h b/mobile/src/operators/kernel/elementwise_add_kernel.h deleted file mode 100644 index 8fa07e519e..0000000000 --- a/mobile/src/operators/kernel/elementwise_add_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEADD_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/math/elementwise_op_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using namespace framework; - -template -class ElementwiseAddKernel - : public framework::OpKernelBase> { - public: - void Compute(const ElementwiseAddParam ¶m); - bool Init(ElementwiseAddParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/elementwise_add_relu_kernel.h b/mobile/src/operators/kernel/elementwise_add_relu_kernel.h deleted file mode 100644 index d18c4e27fa..0000000000 --- a/mobile/src/operators/kernel/elementwise_add_relu_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_ELEMENTWISEADDRELU_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using namespace framework; - -template -class ElementwiseAddReluKernel - : public framework::OpKernelBase> { - public: - void Compute(const ElementwiseAddReluParam ¶m); - bool Init(ElementwiseAddReluParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/elementwise_mul_kernel.h b/mobile/src/operators/kernel/elementwise_mul_kernel.h deleted file mode 100644 index f71b6257d5..0000000000 --- a/mobile/src/operators/kernel/elementwise_mul_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEMUL_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class ElementwiseMulKernel - : public framework::OpKernelBase> { - public: - void Compute(const ElementwiseMulParam ¶m); - bool Init(ElementwiseMulParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/elementwise_sub_kernel.h b/mobile/src/operators/kernel/elementwise_sub_kernel.h deleted file mode 100644 index 89536b9208..0000000000 --- a/mobile/src/operators/kernel/elementwise_sub_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEADD_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/math/elementwise_op_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class ElementwiseSubKernel - : public framework::OpKernelBase> { - public: - void Compute(const ElementwiseSubParam ¶m); - bool Init(ElementwiseSubParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/exp_kernel.h b/mobile/src/operators/kernel/exp_kernel.h deleted file mode 100644 index ed7c4296f8..0000000000 --- a/mobile/src/operators/kernel/exp_kernel.h +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef EXP_OP - -#include -#include "framework/operator.h" -namespace paddle_mobile { -namespace operators { -DECLARE_KERNEL(EXP, EXPParam) -} -} // namespace paddle_mobile -#endif // EXP_OP diff --git a/mobile/src/operators/kernel/expand_kernel.h b/mobile/src/operators/kernel/expand_kernel.h deleted file mode 100644 index 00c12a9372..0000000000 --- a/mobile/src/operators/kernel/expand_kernel.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef EXPAND_OP -DECLARE_KERNEL(Expand, ExpandParam); -#endif // EXPAND_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fc_relu_kernel.h b/mobile/src/operators/kernel/fc_relu_kernel.h deleted file mode 100644 index 6735a50bee..0000000000 --- a/mobile/src/operators/kernel/fc_relu_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_FCRELU_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class FusionFcReluKernel - : public framework::OpKernelBase> { - public: - void Compute(const FusionFcReluParam& param); - bool Init(FusionFcReluParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/feed_kernel.h b/mobile/src/operators/kernel/feed_kernel.h deleted file mode 100644 index 2f6fb6b31d..0000000000 --- a/mobile/src/operators/kernel/feed_kernel.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class FeedKernel - : public framework::OpKernelBase> { - public: - void Compute(const FeedParam ¶m); - bool Init(FeedParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fetch_kernel.h b/mobile/src/operators/kernel/fetch_kernel.h deleted file mode 100644 index d9ed91855d..0000000000 --- a/mobile/src/operators/kernel/fetch_kernel.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using namespace framework; - -template -class FetchKernel - : public framework::OpKernelBase> { - public: - void Compute(const FetchParam ¶m); - bool Init(FetchParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/flatten2_kernel.h b/mobile/src/operators/kernel/flatten2_kernel.h deleted file mode 100644 index 78b3e820e6..0000000000 --- a/mobile/src/operators/kernel/flatten2_kernel.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// -// Created by hujie09 on 2019-07-31. 
-// - -#ifdef FLATTEN2_OP -#include -#include "framework/operator.h" -namespace paddle_mobile { -namespace operators { -DECLARE_KERNEL(Flatten2, FlattenParam) -} -} // namespace paddle_mobile - -#endif // FLATTEN2_KERNEL diff --git a/mobile/src/operators/kernel/flatten_kernel.h b/mobile/src/operators/kernel/flatten_kernel.h deleted file mode 100644 index 4846725bcb..0000000000 --- a/mobile/src/operators/kernel/flatten_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FLATTEN_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class FlattenKernel - : public framework::OpKernelBase> { - public: - void Compute(const FlattenParam& param); - bool Init(FlattenParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/conv_add_bn_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/conv_add_bn_kernel.cpp deleted file mode 100644 index 8debe5afac..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/conv_add_bn_kernel.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBN_OP - -#include "operators/kernel/conv_add_bn_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddBNKernel::Init(FusionConvAddBNParam* param) { - // bool relu_enabled = false; - zynqmp::PE& conv = param.context().convPE(); - ConvParam& p = conv.param(); - p.input = param->Input()->ZynqTensor(); - p.filter = param->Filter()->ZynqTensor(); - - BatchnormParam* bn = new BatchnormParam(); - p.bn = bn; - - return true; -} - -template <> -void ConvAddBNKernel::Compute( - const FusionConvAddBNParam& param) { - zynqmp::PE& conv = param.context().convPE(); - conv.dispatch(); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/conv_add_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/conv_add_kernel.cpp deleted file mode 100644 index 0214f2231b..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/conv_add_kernel.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP - -#include "operators/kernel/conv_add_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddKernel::Init(FusionConvAddParam *param) { - return true; -} - -template <> -void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/conv_add_relu_kernel.cpp deleted file mode 100644 index e0170a7de5..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/conv_add_relu_kernel.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDRELU_OP - -#include "operators/kernel/conv_add_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { - return true; -} - -template <> -void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/conv_bn_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/conv_bn_kernel.cpp deleted file mode 100644 index a137c920c3..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/conv_bn_kernel.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBN_OP - -#include "operators/kernel/conv_bn_kernel.h" -#include "fpga/KD/pes/conv_pe.hpp" - -using ConvPE = paddle_mobile::zynqmp::ConvPE; - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNKernel::Init(FusionConvBNParam* param) { - param->Output()->mutable_data(); - - ConvPE& pe = param->context().pe(); - zynqmp::ConvParam& conv_param = pe.param(); - zynqmp::BatchnormParam* bn_param = new zynqmp::BatchnormParam(); - bn_param->bias = param->InputBias()->zynqmpTensor(); - bn_param->scale = param->InputScale()->zynqmpTensor(); - bn_param->mean = param->InputMean()->zynqmpTensor(); - bn_param->variance = param->InputVariance()->zynqmpTensor(); - bn_param->epsilon = param->Epsilon(); - conv_param.input = param->Input()->zynqmpTensor(); - conv_param.output = param->Output()->zynqmpTensor(); - conv_param.filter = param->Filter()->zynqmpTensor(); - conv_param.batchnorm = bn_param; - conv_param.relu.enabled = false; - conv_param.groups = param->Groups(); - conv_param.strides = param->Strides(); - conv_param.paddings = param->Paddings(); - pe.init(); - pe.apply(); - return true; -} - -template <> -void ConvBNKernel::Compute(const FusionConvBNParam& param) { - std::cout << "ConvBNKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - ConvPE& pe = context.pe(); - pe.dispatch(); - - std::string path = - "bn_" + std::to_string(param.Output()->zynqmpTensor()->id()) + ".txt"; - // 
param.Output()->zynqmpTensor()->saveToFile(path); - - // param.Output()->zynqmpTensor()->saveToFile(); - std::cout << "Out scale:" << param.Output()->zynqmpTensor()->scale()[0] - << std::endl; -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/conv_bn_relu_kernel.cpp deleted file mode 100644 index 5b3b1deb1c..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/conv_bn_relu_kernel.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBNRELU_OP - -#include "operators/kernel/conv_bn_relu_kernel.h" -#include "fpga/KD/pes/conv_pe.hpp" - -#include - -using ConvPE = paddle_mobile::zynqmp::ConvPE; - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNReluKernel::Init(FusionConvBNReluParam* param) { - param->Output()->mutable_data(); - - ConvPE& pe = param->context().pe(); - zynqmp::ConvParam& conv_param = pe.param(); - zynqmp::BatchnormParam* bn_param = new zynqmp::BatchnormParam(); - bn_param->bias = param->InputBias()->zynqmpTensor(); - bn_param->scale = param->InputScale()->zynqmpTensor(); - bn_param->mean = param->InputMean()->zynqmpTensor(); - bn_param->variance = param->InputVariance()->zynqmpTensor(); - bn_param->epsilon = param->Epsilon(); - conv_param.input = param->Input()->zynqmpTensor(); - conv_param.output = param->Output()->zynqmpTensor(); - conv_param.filter = param->Filter()->zynqmpTensor(); - conv_param.batchnorm = bn_param; - conv_param.relu.enabled = true; - conv_param.groups = param->Groups(); - conv_param.strides = param->Strides(); - conv_param.paddings = param->Paddings(); - pe.init(); - pe.apply(); - return true; -} -template <> -void ConvBNReluKernel::Compute( - const FusionConvBNReluParam& param) { - std::cout << "ConvBNReluKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - ConvPE& pe = context.pe(); - pe.dispatch(); - - std::string path = - "bnr_" + std::to_string(param.Output()->zynqmpTensor()->id()) + ".txt"; - // param.Output()->zynqmpTensor()->saveToFile(path); - std::cout << "Out scale:" << param.Output()->zynqmpTensor()->scale()[0] - << std::endl; - - if (isinf(param.Output()->zynqmpTensor()->scale()[0])) { - // zynqmp::ConvParam& conv_param = pe.param(); - std::cout << "invalid cale !!!!!!!!!!!!" 
<< std::endl; - // std::cout << conv_param.convArgs.conv_arg[0].kernel.width << std::endl; - exit(-1); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/elementwise_add_relu_kernel.cpp deleted file mode 100644 index 52e95158c4..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/elementwise_add_relu_kernel.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_ELEMENTWISEADDRELU_OP - -#include "operators/kernel/elementwise_add_relu_kernel.h" -#include "fpga/KD/pes/elementwise_add_pe.hpp" - -using ElementwiseAddPE = paddle_mobile::zynqmp::ElementwiseAddPE; - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseAddReluKernel::Init( - ElementwiseAddReluParam* param) { - param->Out()->mutable_data(); - - ElementwiseAddPE& pe = param->context().pe(); - zynqmp::ElementwiseAddParam& ew_param = pe.param(); - ew_param.inputs = { - param->InputX()->zynqmpTensor(), - param->InputY()->zynqmpTensor(), - }; - ew_param.output = param->Out()->zynqmpTensor(); - ew_param.relu.enabled = true; - - pe.init(); - pe.apply(); - return true; -} - -template <> -void ElementwiseAddReluKernel::Compute( - const ElementwiseAddReluParam& param) { - std::cout << "ElementwiseAddReluKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - ElementwiseAddPE& pe = context.pe(); - pe.dispatch(); - - std::string path = - "ew_" + std::to_string(param.Out()->zynqmpTensor()->id()) + ".txt"; - // param.Out()->zynqmpTensor()->saveToFile(path); - std::cout << "Out scale:" << param.Out()->zynqmpTensor()->scale()[0] - << std::endl; -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/feed_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/feed_kernel.cpp deleted file mode 100644 index 7a0450c599..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/feed_kernel.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/kernel/feed_kernel.h" -#include "fpga/KD/pes/input_pe.hpp" - -using InputParam = paddle_mobile::zynqmp::InputParam; -using InputPE = paddle_mobile::zynqmp::InputPE; - -namespace paddle_mobile { -namespace operators { - -template <> -bool FeedKernel::Init(FeedParam* param) { - int col = param->Col(); - auto input = const_cast(¶m->InputX()->at(col)); - - InputPE& pe = param->context().pe(); - InputParam& input_param = pe.param(); - input->mutable_data(); - zynqmp::Tensor* input_tensor = input->zynqmpTensor(); - input_param.input = input_tensor; - param->Out()->mutable_data(); - auto out = param->Out()->zynqmpTensor(); - input_param.output = out; - pe.init(); - - return true; -} - -template <> -void FeedKernel::Compute(const FeedParam& param) { - std::cout << "FeedKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - InputPE& pe = context.pe(); - - int col = param.Col(); - auto input = const_cast(¶m.InputX()->at(col)); - InputParam& input_param = pe.param(); - input->mutable_data(); - zynqmp::Tensor* input_tensor = input->zynqmpTensor(); - input_param.input = input_tensor; - param.Out()->Resize(input->dims()); - param.Out()->mutable_data(); - auto out = param.Out()->zynqmpTensor(); - input_param.output = out; - pe.dispatch(); - - param.Out()->zynqmpTensor()->saveToFile("feed_out.txt"); -} -template class FeedKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/KD/fetch_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/fetch_kernel.cpp deleted file mode 100644 index 
75b0e0ccf8..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/fetch_kernel.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "operators/kernel/fetch_kernel.h" -#include "fpga/KD/pes/output_pe.hpp" - -namespace paddle_mobile { -namespace operators { - -using OutputPE = zynqmp::OutputPE; - -template <> -bool FetchKernel::Init(FetchParam* param) { - auto input = param->InputX(); - int col = param->Col(); - auto output = &(param->Out()->at(col)); - output->Resize(input->dims()); - output->mutable_data(); - - zynqmp::Context& context = const_cast(param->context_); - OutputPE& pe = context.pe(); - zynqmp::OutputParam& out_param = pe.param(); - out_param.input = input->zynqmpTensor(); - out_param.output = output->zynqmpTensor(); - - pe.init(); - pe.apply(); - return true; -} - -template <> -void FetchKernel::Compute(const FetchParam& param) { - std::cout << "FetchKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - OutputPE& pe = context.pe(); - pe.dispatch(); - - int col = param.Col(); - auto output = &(param.Out()->at(col)); - output->zynqmpTensor()->saveToFile("fetch_out.txt"); -} -template class FetchKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/KD/fusion_fc_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/fusion_fc_kernel.cpp deleted file mode 100644 index 5b564fe4b6..0000000000 --- 
a/mobile/src/operators/kernel/fpga/KD/fusion_fc_kernel.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef FUSION_FC_OP - -#include "operators/kernel/fusion_fc_kernel.h" -#include "fpga/KD/pes/fully_connected_pe.hpp" - -namespace paddle_mobile { -namespace operators { - -using FullyConnectedPE = zynqmp::FullyConnectedPE; - -template <> -bool FusionFcKernel::Init(FusionFcParam* param) { - param->Out()->mutable_data(); - - FullyConnectedPE& pe = param->context().pe(); - zynqmp::FullyConnectedParam& fc_param = pe.param(); - fc_param.input = param->InputX()->zynqmpTensor(); - fc_param.output = param->Out()->zynqmpTensor(); - fc_param.filter = param->InputY()->zynqmpTensor(); - fc_param.bias = param->InputZ()->zynqmpTensor(); - pe.init(); - pe.apply(); - return true; -} - -template <> -void FusionFcKernel::Compute(const FusionFcParam& param) { - std::cout << "FusionFcKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - FullyConnectedPE& pe = context.pe(); - pe.dispatch(); - - param.Out()->zynqmpTensor()->invalidate(); - std::string path = - "fc_" + std::to_string(param.Out()->zynqmpTensor()->id()) + ".txt"; - param.Out()->zynqmpTensor()->saveToFile(path); - std::cout << "Out scale:" << param.Out()->zynqmpTensor()->scale()[0] - << std::endl; -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git 
a/mobile/src/operators/kernel/fpga/KD/pool_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/pool_kernel.cpp deleted file mode 100644 index 69db4472c9..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/pool_kernel.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef POOL_OP - -#include "operators/kernel/pool_kernel.h" -#include "fpga/KD/pes/pooling_pe.hpp" - -class PoolingArgs; -namespace paddle_mobile { -namespace operators { - -template <> -bool PoolKernel::Init(PoolParam* param) { - param->Output()->mutable_data(); - - zynqmp::PoolingPE& pe = param->context().pe(); - zynqmp::PoolingParam& pool_param = pe.param(); - - pool_param.input = param->Input()->zynqmpTensor(); - pool_param.output = param->Output()->zynqmpTensor(); - pool_param.type = param->PoolingType() == "max" - ? 
zynqmp::PoolingType::MAX - : zynqmp::PoolingType::AVERAGE; - pool_param.globalPooling = param->isGlobalPooling(); - pool_param.kernelSize = param->Ksize(); - pool_param.strides = param->Strides(); - pool_param.paddings = param->Paddings(); - - pe.init(); - pe.apply(); - return true; -} - -template <> -void PoolKernel::Compute(const PoolParam& param) { - std::cout << "PoolKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - zynqmp::PoolingPE& pe = context.pe(); - pe.dispatch(); - - std::string path = - "pool_" + std::to_string(param.Output()->zynqmpTensor()->id()) + ".txt"; - param.Output()->zynqmpTensor()->saveToFile(path); - // param.Output()->zynqmpTensor()->saveToFile(); - std::cout << "Out scale:" << param.Output()->zynqmpTensor()->scale()[0] - << std::endl; -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/KD/softmax_kernel.cpp b/mobile/src/operators/kernel/fpga/KD/softmax_kernel.cpp deleted file mode 100644 index dace88c5a2..0000000000 --- a/mobile/src/operators/kernel/fpga/KD/softmax_kernel.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SOFTMAX_OP - -#include "operators/kernel/softmax_kernel.h" -#include "fpga/KD/pes/softmax_pe.hpp" -#include "operators/kernel/central-arm-func/softmax_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SoftmaxKernel::Init(SoftmaxParam* param) { - param->Out()->mutable_data(); - - zynqmp::SoftmaxPE& pe = param->context().pe(); - zynqmp::SoftmaxParam& fc_param = pe.param(); - fc_param.input = param->InputX()->zynqmpTensor(); - fc_param.output = param->Out()->zynqmpTensor(); - pe.init(); - pe.apply(); - return true; -} - -template <> -void SoftmaxKernel::Compute(const SoftmaxParam& param) { - std::cout << "SoftmaxKernel\n"; - zynqmp::Context& context = const_cast(param.context_); - zynqmp::SoftmaxPE& pe = context.pe(); - pe.dispatch(); - - param.Out()->zynqmpTensor()->invalidate(); - std::string path = - "softmax_" + std::to_string(param.Out()->zynqmpTensor()->id()) + ".txt"; - param.Out()->zynqmpTensor()->saveToFile(path); - std::cout << "Out scale:" << param.Out()->zynqmpTensor()->scale()[0] - << std::endl; -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp deleted file mode 100644 index 31872411f7..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ANCHOR_GENERATOR_OP -#include -#include -#include -#include -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool AnchorGeneratorKernel::Init( - AnchorGeneratorParam *param) { - auto input = param->input_; - auto anchors = param->output_anchors_; - auto anchor_ptr = anchors->mutable_data(); - auto stride = param->stride_; - auto feature_width = input->dims()[3], feature_height = input->dims()[2]; - auto stride_width = stride[0], stride_height = stride[1]; - auto offset = param->offset_; - - int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, - -20, 39, 36, -43, -34, 59, 49, -63, -54, - 79, 69, -96, -77, 112, 93, -137, -118, 153, - 134, -204, -188, 220, 204, -281, -395, 296, 441}; - - int anchors_offset2[] = {-18, -31, 34, 47, -22, -22, 38, 38, -33, - -44, 49, 60, -2, -2, 18, 18, -10, -14, - 26, 30, -14, -22, 30, 38, -9, -26, 25, - 42, -92, -92, 108, 108, -2, -15, 18, 31}; - - if (offset > 0.6) { - memcpy(anchors_offset, anchors_offset2, sizeof(anchors_offset)); - std::cout << "anchor generator marker" << std::endl; - } else { - std::cout << "anchor generator rfcn" << std::endl; - } - int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4); - - // DLOG << "feature_height: " << feature_height; - // DLOG << "feature_width: " << feature_width; - // DLOG << "num_anchors: " << num_anchors; - // DLOG << "stride_width: " << stride_width; - // DLOG << "stride_height: " << stride_height; - - for (int h_idx = 0; h_idx < feature_height; ++h_idx) { - int offset0 = h_idx * feature_width * num_anchors * 4; - for (int w_idx = 0; w_idx < feature_width; ++w_idx) { - int offset1 = w_idx * num_anchors * 4; - for (int idx = 0; idx < num_anchors; idx++) { - int offset = offset0 + offset1 + idx * 4; - anchor_ptr[offset + 0] = - anchors_offset[idx * 4 + 0] + w_idx * stride_width; - 
anchor_ptr[offset + 1] = - anchors_offset[idx * 4 + 1] + h_idx * stride_height; - anchor_ptr[offset + 2] = - anchors_offset[idx * 4 + 2] + w_idx * stride_width; - anchor_ptr[offset + 3] = - anchors_offset[idx * 4 + 3] + h_idx * stride_height; - } - } - } - return true; -} - -template <> -void AnchorGeneratorKernel::Compute( - const AnchorGeneratorParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ANCHOR_GENERATOR_OP diff --git a/mobile/src/operators/kernel/fpga/V1/concat_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/concat_kernel.cpp deleted file mode 100644 index 7690f41ad3..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/concat_kernel.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONCAT_OP - -#include "operators/kernel/concat_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConcatKernel::Init(ConcatParam *param) { - auto inputs = param->Inputs(); - auto out = param->Out(); - auto image_num = inputs.size(); - auto images_in = - (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT - auto scales_in = - (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT - auto channel_num = - (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT - - auto height = inputs[0]->dims()[2]; - auto width = inputs[0]->dims()[3]; - for (int i = 0; i < image_num; i++) { - auto input = inputs[i]; - PADDLE_MOBILE_ENFORCE( - input->dims()[2] == height && input->dims()[3] == width, - "Image height & width should be unified"); - images_in[i] = input->data(); - channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT - scales_in[i] = input->scale; - } - fpga::format_concat_output(out, height, width, image_num, channel_num); - - fpga::ConcatArgs concatArgs = {0}; - concatArgs.image_num = image_num; - concatArgs.images_in = images_in; - concatArgs.scales_in = scales_in; - concatArgs.image_out = out->data(); - concatArgs.scale_out = out->scale; - concatArgs.channel_num = channel_num; - concatArgs.height = height; - concatArgs.width = width; - param->SetFpgaArgs(concatArgs); - return true; -} - -template <> -void ConcatKernel::Compute(const ConcatParam ¶m) { - ComputeFPGAConcat(param.FpgaArgs()); -} -template class ConcatKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp deleted file mode 100644 index c052805dfd..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBN_OP - -#include "operators/kernel/conv_add_bn_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - - auto bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - - auto out = param->Output(); - - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && - bias->dims()[0] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - - const int channel = out->dims()[1]; - auto bs_ptr = - reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_ptr[i] - 
bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i]; - bs_ptr[i] = new_bias_ptr[i]; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - - delete new_scale; - delete new_bias; - - return true; -} - -template <> -void ConvAddBNKernel::Compute( - const FusionConvAddBNParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp deleted file mode 100755 index a7a93de9ba..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADDBNRELU_OP - -#include "operators/kernel/conv_add_bn_relu_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddBNReluKernel::Init( - FusionConvAddBNReluParam *param) { - // bool relu_enabled = true; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - vector paddings = param->Paddings(); - vector strides = param->Strides(); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && - bias->dims()[0] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - - const int channel = out->dims()[1]; - auto bs_ptr = - reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i]; - bs_ptr[i] = new_bias_ptr[i]; - } - - const int groups = param->Groups(); - if (groups == channel) { - fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); - fpga::DWconvArgs dwconv_arg = {0}; - fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, strides[0], strides[1], - paddings[0], 
paddings[1], new_bias_ptr); - param->SetFpgaArgs(dwconv_arg); - fpga::fpga_free(new_scale_ptr); - fpga::fpga_free(bs_ptr); - } else { - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), strides[0], - strides[1], paddings[0], paddings[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - delete new_scale; - delete new_bias; - } - return true; -} - -template <> -void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWConv(param.FpgaDwconvArgs()); - } else { - fpga::ComputeFpgaConv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_add_kernel.cpp deleted file mode 100644 index da16af58f1..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_add_kernel.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP - -#include "operators/kernel/conv_add_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddKernel::Init(FusionConvAddParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = bias_ptr[i]; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp deleted file mode 100644 index f1f61da421..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDRELU_OP - -#include "operators/kernel/conv_add_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { - // bool relu_enabled = true; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = bias_ptr[i]; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp 
deleted file mode 100644 index 54d99f22d1..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBN_OP - -#include "operators/kernel/conv_bn_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNKernel::Init(FusionConvBNParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // // NOLINT - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - 
static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i]; - bs_ptr[i] = new_bias_ptr[i]; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - delete new_scale; - delete new_bias; - return true; -} - -template <> -void ConvBNKernel::Compute(const FusionConvBNParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp deleted file mode 100644 index 4ce8265f7f..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBNRELU_OP - -#include "operators/kernel/conv_bn_relu_kernel.h" -#include -namespace paddle_mobile { -namespace operators { -template <> -bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i]; - bs_ptr[i] = new_bias_ptr[i]; - } - const int groups = param->Groups(); - if (groups == channel) { - fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); - fpga::DWconvArgs dwconv_arg = {0}; - fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], new_bias_ptr); - param->SetFpgaArgs(dwconv_arg); - } else { - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, 
out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - } - delete new_scale; - delete new_bias; - return true; -} -template <> -void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWConv(param.FpgaDwconvArgs()); - } else { - fpga::ComputeFpgaConv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_kernel.cpp deleted file mode 100644 index 57b5eb754e..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_kernel.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_OP - -#include "operators/kernel/conv_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvKernel::Init(ConvParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = 0; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void ConvKernel::Compute(const ConvParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp deleted file mode 100644 index 1597885e43..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_TRANSPOSE_OP - -#include "operators/kernel/conv_transpose_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvTransposeKernel::Init(ConvTransposeParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - // const Tensor *bias = param->Bias(); - // auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - // "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = 0; // bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - fpga::format_deconv_data(filter, 
out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp deleted file mode 100644 index a8205df3c9..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDBN_OP - -#include "operators/kernel/deconv_add_bn_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { - // bool relu_enabled = true; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->InputBias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void DeconvAddBNKernel::Compute( - const FusionDeconvAddBNParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp deleted file mode 100755 index b27f5cf870..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDBNRELU_OP - -#include "operators/kernel/deconv_add_bn_relu_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddBNReluKernel::Init( - FusionDeconvAddBNReluParam *param) { - // bool relu_enabled = true; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->InputBias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void DeconvAddBNReluKernel::Compute( - const FusionDeconvAddBNReluParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp deleted file mode 100644 index 41844d008b..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADD_OP - -#include "operators/kernel/deconv_add_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, 
input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - - return true; -} - -template <> -void DeconvAddKernel::Compute( - const FusionDeconvAddParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp deleted file mode 100644 index c6fc9d1955..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDRELU_OP - -#include "operators/kernel/deconv_add_relu_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddReluKernel::Init( - FusionDeconvAddReluParam *param) { - // bool relu_enabled = true; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void DeconvAddReluKernel::Compute( - const FusionDeconvAddReluParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp deleted file mode 100644 index 75597f0ecd..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVBNRELU_OP - -#include "operators/kernel/deconv_bn_relu_kernel.h" -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvBNReluKernel::Init( - FusionDeconvBNReluParam *param) { - // bool relu_enabled = true; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->InputBias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - } - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel]; - bs_ptr[i] = new_bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - 
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, - activation_enable, leaky_relu_negative_slope, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - delete new_scale; - delete new_bias; - return true; -} - -template <> -void DeconvBNReluKernel::Compute( - const FusionDeconvBNReluParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/dropout_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/dropout_kernel.cpp deleted file mode 100644 index 8b990d46e0..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/dropout_kernel.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DROPOUT_OP - -#include "operators/kernel/dropout_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DropoutKernel::Init(DropoutParam *param) { - param->Out()->ShareDataWith(*param->InputX()); - return true; -} - -template <> -void DropoutKernel::Compute(const DropoutParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp deleted file mode 100644 index db4d2afbc1..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef ELEMENTWISEADD_OP - -#include "operators/kernel/elementwise_add_kernel.h" - -#include -#include "fpga/V1/api.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { - auto *input_y = const_cast(param->InputY()); - auto *out = param->Out(); - if (input_y->type() != type_id()) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto *input_x = const_cast(param->InputX()); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); - - fpga::EWAddArgs ewaddArgs = {0}; - // ewaddArgs.relu_enabled = relu_enabled; - ewaddArgs.output.activation.activation_type = activation_enable; - ewaddArgs.output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - ewaddArgs.const0 = 0x3c00; // =1 - ewaddArgs.const1 = 0x3c00; // =1 - ewaddArgs.image0.address = input_x_ptr; - ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; - ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; - ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; - ewaddArgs.image0.pad_height = 0; - ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = input_y_ptr; - ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; - ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; - ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; - ewaddArgs.image1.pad_height = 0; - ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = out_ptr; - fpga::expand_EW_arg(&ewaddArgs); - param->SetFpgaArgs(ewaddArgs); - } else { - param->float_input_x.Resize(param->InputX()->dims()); - param->float_input_x.init(type_id().hash_code()); - fpga::format_fp32_ofm(&(param->float_input_x)); - - 
param->float_out.Resize(param->InputX()->dims()); - param->float_out.mutable_data(param->InputX()->dims()); - fpga::format_fp32_ofm(&(param->float_out)); - - fpga::format_fp16_ofm(out); - } - return true; -} -inline void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { - auto input_x = param.float_input_x; - auto input_y = param.InputY(); - auto Out = param.float_out; - int axis = param.Axis(); - - const auto &x_dims = input_x.dims(); - const auto &y_dims = input_y->dims(); - /// axis = -1 represent the last dimensions. - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - size_t batch = 1; - size_t channels = 1; - size_t elementwise_num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - elementwise_num *= x_dims[i]; - } - const float *bias_data = input_y->data(); - const float *input_data = input_x.data(); - float *output_data = Out.mutable_data(); - - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - size_t offset = (i * channels + j) * elementwise_num; - const float *input = input_data + offset; - const float bias = bias_data[j]; - float *output = output_data + offset; - // DLOG << "output address: "<< output; - for (int k = 0; k < elementwise_num; ++k) { - output[k] = input[k] + bias; - // DLOG << "output[" << k << "]= " << output[k] ; - } - } - } -} -template <> -void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) { - auto input_y = const_cast(param.InputY()); - if (input_y->type() != type_id()) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); - } else { - auto input_x = const_cast(param.InputX()); - auto intput_x_float = const_cast(&(param.float_input_x)); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.input_layout_type = fpga::LAYOUT_CHW; - 
args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = input_x->data(); - args.image.channels = (uint32_t)(input_x->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = intput_x_float->data(); - args.output.scale_address = intput_x_float->scale; - - // fpga::fpga_flush(input_x->data(),input_x->fpga_data_num * - // sizeof(half)); - fpga::PerformBypass(args); - fpga::fpga_invalidate(args.output.address, - input_x->fpga_data_num * sizeof(float)); - - // just for test - /* { - static int cnt = 0; - if(cnt == 0){ - std::string str= "first_bypass_data"; - float rslt = 0.0f; - fpga::savefile(str, args.output.address, input_x->fpga_data_num, - rslt); cnt++; - } - }*/ - ElementwiseAddCompute(param); - - auto out_float = const_cast(&(param.float_out)); - DLOG << "out float: " << out_float->data(); - fpga::fpga_flush(out_float->data(), - input_x->fpga_data_num * sizeof(float)); - // just for test - /*{ - static int cnt = 0; - if(cnt == 0){ - std::string str= "ew_output_data"; - float rslt = 0.0f; - - fpga::savefile(str, out_float->data(), input_x->fpga_data_num, - rslt); cnt++; - } - }*/ - auto Out = param.Out(); - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = out_float->data(); - args.image.channels = (uint32_t)(input_x->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = Out->data(); - args.output.scale_address = Out->scale; - fpga::PerformBypass(args); - } -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp deleted file mode 100644 index 
f36206a8a1..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef FUSION_ELEMENTWISEADDRELU_OP - -#include "operators/kernel/elementwise_add_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseAddReluKernel::Init( - ElementwiseAddReluParam *param) { - // bool relu_enabled = true; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto *input_x = const_cast(param->InputX()); - auto *input_y = const_cast(param->InputY()); - auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); - - fpga::EWAddArgs ewaddArgs = {0}; - // ewaddArgs.relu_enabled = relu_enabled; - ewaddArgs.output.activation.activation_type = activation_enable; - ewaddArgs.output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - ewaddArgs.const0 = 0x3c00; // =1 - ewaddArgs.const1 = 0x3c00; // =1 - ewaddArgs.image0.address = input_x_ptr; - ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; - ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; - ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; - 
ewaddArgs.image0.pad_height = 0; - ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = input_y_ptr; - ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; - ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; - ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; - ewaddArgs.image1.pad_height = 0; - ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = out_ptr; - fpga::expand_EW_arg(&ewaddArgs); - param->SetFpgaArgs(ewaddArgs); - return true; -} - -template <> -void ElementwiseAddReluKernel::Compute( - const ElementwiseAddReluParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp deleted file mode 100644 index d744ae2c07..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEMUL_OP - -#include "operators/kernel/elementwise_mul_kernel.h" -#include "operators/math/elementwise_op_function.h" - -namespace paddle_mobile { -namespace operators { - -template -struct MulFunctor { - inline T operator()(T a, T b) const { return a * b; } -}; -template <> -bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { - param->float_input_x.Resize(param->InputX()->dims()); - param->float_input_x.init(type_id().hash_code()); - fpga::format_fp32_ofm(&(param->float_input_x)); - - param->float_out.Resize(param->InputX()->dims()); - param->float_out.init(type_id().hash_code()); - fpga::format_fp32_ofm(&(param->float_out)); - - auto *out = param->Out(); - fpga::format_fp16_ofm(out); - return true; -} - -template <> -void ElementwiseMulKernel::Compute( - const ElementwiseMulParam ¶m) { - auto input_x = const_cast(param.InputX()); - auto intput_x_float = const_cast(&(param.float_input_x)); - // auto intput_x_32_ptr = - // const_cast(param.float_input_x.data()); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = input_x->data(); - args.image.channels = (uint32_t)(input_x->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = intput_x_float->data(); - args.output.scale_address = intput_x_float->scale; - fpga::PerformBypass(args); - fpga::fpga_invalidate(args.output.address, - input_x->fpga_data_num * sizeof(float)); - - auto input_y = param.InputY(); - int axis = param.Axis(); - auto out_float = const_cast(&(param.float_out)); - ElementwiseComputeEx, float>( - intput_x_float, input_y, axis, MulFunctor(), out_float); - fpga::fpga_flush(out_float->data(), - input_x->fpga_data_num * sizeof(float)); - - Tensor *Out = param.Out(); - 
args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = out_float->data(); - args.image.channels = (uint32_t)(Out->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = Out->data(); - args.output.scale_address = Out->scale; - fpga::PerformBypass(args); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/feed_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/feed_kernel.cpp deleted file mode 100644 index 28559b2b4b..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/kernel/feed_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FeedKernel::Init(FeedParam *param) { - auto output = param->Out(); - int col = param->Col(); - DLOG << "col = " << col; - auto input = const_cast(¶m->InputX()->at(col)); - input->init(type_id().hash_code()); - input->Resize(output->dims()); - - if (output->dims().size() != 4) { - return true; - } - - fpga::format_fp16_ofm(output); - return true; -} - -template <> -void FeedKernel::Compute(const FeedParam ¶m) { - auto output = param.Out(); - int col = param.Col(); - auto input = const_cast(¶m.InputX()->at(col)); - kTypeId_t input_type = input->type(); - - if (input_type == type_id()) { - input->init(type_id().hash_code()); - } else { - input->init(type_id().hash_code()); - } - input->Resize(output->dims()); - - if (output->dims().size() != 4) { - size_t size = output->numel() * sizeof(float); - auto output_ptr = output->data(); - auto input_ptr = input->data(); - auto external_ptr = reinterpret_cast(input->external_data); - float *p_data = external_ptr == nullptr ? input_ptr : external_ptr; - memcpy(output_ptr, p_data, size); - input->external_data = nullptr; - return; - } - - fpga::format_image(input); - auto output_ptr = output->data(); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; - if (input_type == type_id()) { - auto input_ptr = input->data(); - auto external_ptr = reinterpret_cast(input->external_data); - float *p_data = external_ptr == nullptr ? 
input_ptr : external_ptr; - - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = p_data; - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output_ptr; - args.output.scale_address = output->scale; - fpga::PerformBypass(args); - input->external_data = nullptr; - } else { - auto input_ptr = input->data(); - auto external_ptr = reinterpret_cast(input->external_data); - int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr; - - args.input_data_type = fpga::DATA_TYPE_INT8; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = p_data; - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output_ptr; - args.output.scale_address = output->scale; - fpga::PerformBypass(args); - input->external_data = nullptr; - } -} -template class FeedKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/fetch_kernel.cpp deleted file mode 100644 index 87ede2af1a..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "operators/kernel/fetch_kernel.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool FetchKernel::Init(FetchParam *param) { - auto input = const_cast(param->InputX()); - int col = param->Col(); - DLOG << "col = " << col; - auto output = &(param->Out()->at(col)); - if (input->type() == type_id()) { - return true; - } - output->init(type_id().hash_code()); - output->Resize(input->dims()); - fpga::format_fp32_ofm(output); - int outC = 1; - int outH = 1; - int outW = 1; - if (output->dims().size() == 4) { - outC = output->dims()[1]; - outH = output->dims()[2]; - outW = output->dims()[3]; - } else { // 2 - outC = output->dims()[1]; - } - int unalignedCW = outC * outW; - int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT); - if (alignedCW != unalignedCW) { - param->aligned_out.Resize(input->dims()); - param->aligned_out.mutable_data(input->dims()); - fpga::fpga_flush(param->aligned_out.data(), - outH * unalignedCW * sizeof(float)); - } - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = input->data(); - args.image.channels = (uint32_t)(input->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output->data(); - args.output.scale_address = output->scale; - param->fpga_bypass_args = args; - - return true; -} -void dealign(float *src, float 
*dst, int input_c, int input_h, int input_w) { - int alignCW = paddle_mobile::fpga::align_to_x(input_c * input_w, 16); - int dealignCW = input_c * input_w; - for (int h = 0; h < input_h; ++h) { - auto input_offset = h * alignCW; - auto output_offset = h * dealignCW; - memcpy((dst + output_offset), (src + input_offset), - dealignCW * sizeof(float)); - } -} -template <> -void FetchKernel::Compute(const FetchParam ¶m) { - auto input = const_cast(param.InputX()); - int col = param.Col(); - auto output = ¶m.Out()->at(col); - if (input->type() == type_id()) { - output->ShareDataWith(*input); - return; - } - - fpga::BypassArgs args = param.fpga_bypass_args; - auto input_address = (input->data()); - args.image.address = static_cast(input_address); - float *outdata_ptr = - reinterpret_cast(param.fpga_bypass_args.output.address); - const int num_th = 32; - if (output->fpga_data_num < num_th) { - fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half)); - - for (int idx = 0; idx < product(input->dims()); ++idx) { - outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]); - } - return; - } - - fpga::PerformBypass(args); - int outC = 1; - int outH = 1; - int outW = 1; - if (output->dims().size() == 4) { - outC = output->dims()[1]; - outH = output->dims()[2]; - outW = output->dims()[3]; - } else { // 2 - outC = output->dims()[1]; - } - - fpga::fpga_invalidate(param.fpga_bypass_args.output.address, - output->fpga_data_num * sizeof(float)); - int unalignedCW = outC * outW; - int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT); - if (unalignedCW != alignedCW) { - auto aligned_ptr = const_cast(param.aligned_out.data()); - dealign(outdata_ptr, aligned_ptr, outC, outH, outW); - memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float)); - fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float)); - } -} -template class FetchKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git 
a/mobile/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp deleted file mode 100644 index 3a29104d0f..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef FUSION_FC_OP - -#include "operators/kernel/fusion_fc_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcKernel::Init(FusionFcParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - const Tensor *input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - - // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - // "Image channel should be equal to weight number"); - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = input_z_ptr[i]; - } - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = 
(uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(filter); - fpga::format_fc_filter(filter, max_value); - - int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_fp16_ofm(out); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, - leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void FusionFcKernel::Compute(const FusionFcParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp deleted file mode 100644 index fef370515e..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_FCRELU_OP - -#include "operators/kernel/fc_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcReluKernel::Init(FusionFcReluParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - const Tensor *input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - - // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - // "Image channel should be equal to weight number"); - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = input_z_ptr[i]; - } - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(filter); - fpga::format_fc_filter(filter, max_value); - - int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_fp16_ofm(out); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, - leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} 
-} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/pad2d_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/pad2d_kernel.cpp deleted file mode 100644 index 370b34e863..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/pad2d_kernel.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/kernel/pad2d_kernel.h" -namespace paddle_mobile { -namespace operators { -template <> -bool Pad2DKernel::Init(Pad2DParam *param) { - Tensor *output = param->Out(); - fpga::format_fp16_ofm(output); - return true; -} -void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) { - auto input_data = (input->data()); - auto output_data = (output->data()); - auto input_c = input->dims()[1]; - auto input_h = input->dims()[2]; - auto input_w = input->dims()[3]; - auto output_c = output->dims()[1]; - auto output_w = output->dims()[3]; - auto copysize = input_c * input_w; - for (int h = 0; h < input_h; ++h) { - auto input_offset = h * input_c * input_w; - auto output_offset = h * paddle_mobile::fpga::align_to_x( - output_c * output_w, IMAGE_ALIGNMENT); - memcpy((output_data + output_offset), (input_data + input_offset), - copysize * sizeof(half)); - } -} -template <> -void Pad2DKernel::Compute(const Pad2DParam ¶m) { - auto in_x = param.InputX(); - auto out = param.Out(); - fpga::fpga_invalidate((void *)in_x->data(), // 
NOLINT - in_x->numel() * sizeof(half)); - pad2dFunc(in_x, out); - (out->scale)[0] = (in_x->scale)[0]; - (out->scale)[1] = (in_x->scale)[1]; - DLOG << (out->scale)[0]; - DLOG << (out->scale)[1]; - size_t outputSize = - out->dims()[2] * - paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]), - IMAGE_ALIGNMENT) * - sizeof(half); - fpga::fpga_flush(out->data(), outputSize); -} -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V1/pool_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/pool_kernel.cpp deleted file mode 100644 index 7c8dba1696..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef POOL_OP - -#include "operators/kernel/pool_kernel.h" - -class PoolingArgs; -namespace paddle_mobile { -namespace operators { - -template <> -bool PoolKernel::Init(PoolParam *param) { - auto *input = const_cast(param->Input()); - auto *output = param->Output(); - vector ksize = param->Ksize(); - vector strides = param->Strides(); - vector paddings = param->Paddings(); - std::string pooling_type = param->PoolingType(); - - if (input->type() == type_id()) { - int channels = input->dims()[1]; - int height = input->dims()[2]; - int width = input->dims()[3]; - int num = input->dims()[0]; - int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1; - int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1; - framework::DDim dim = - framework::make_ddim({num, channels, out_height, out_width}); - output->mutable_data(dim); - return true; - } - - auto input_ptr = input->data(); - fpga::format_fp16_ofm(output); - auto output_ptr = output->mutable_data(); - - fpga::PoolingArgs poolArgs = {0}; - poolArgs.mode = pooling_type == "max" ? 
0 : 1; // max:0, avg:1 - poolArgs.kernel_reciprocal = - fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); // NOLINT - poolArgs.image.address = input_ptr; - poolArgs.image.channels = (uint32_t)input->dims()[1]; - poolArgs.image.height = (uint32_t)input->dims()[2]; - poolArgs.image.width = (uint32_t)input->dims()[3]; - poolArgs.image.pad_height = (uint32_t)paddings[0]; - poolArgs.image.pad_width = (uint32_t)paddings[1]; - poolArgs.image.scale_address = input->scale; - poolArgs.output.address = output_ptr; - poolArgs.output.scale_address = output->scale; - poolArgs.kernel.height = (uint32_t)ksize[0]; - poolArgs.kernel.width = (uint32_t)ksize[1]; - poolArgs.kernel.stride_h = (uint32_t)strides[0]; - poolArgs.kernel.stride_w = (uint32_t)strides[1]; - param->SetFpgaArgs(poolArgs); - return true; -} - -template <> -void PoolKernel::Compute(const PoolParam ¶m) { - auto *input = const_cast(param.Input()); - - if (input->type() == type_id()) { - auto *output = param.Output(); - auto in = input->data(); - auto N = input->dims()[0]; - output->Resize( - {N, output->dims()[1], output->dims()[2], output->dims()[3]}); - auto len = output->numel(); - auto out = output->mutable_data(); - int C = input->dims()[1], H = input->dims()[2], // N = input->dims()[0], - W = input->dims()[3]; - int HW = H * W, CHW = C * H * W, WC = W * C; - - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - out[n * C + c] = 0; - for (int h = 0; h < H; h++) { - for (int w = 0; w < W; w++) { - out[n * C + c] += in[n * CHW + h * WC + w * C + - c]; // in[n * CHW + c * HW + h * W + w]; // - } - } - out[n * C + c] /= HW; - } - } - return; - } - fpga::ComputeFpgaPool(param.FpgaArgs()); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/proposal_kernel.cpp deleted file mode 100644 index bd6703bb81..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ 
/dev/null @@ -1,567 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PROPOSAL_OP - -#include -#include -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -static const double kBBoxClipDefault = std::log(1000.0 / 16.0); - -template <> -bool ProposalKernel::Init(ProposalParam *param) { - int post_nms_top_n = param->post_nms_topn_; - int64_t batch = param->scores_->dims()[0]; - auto total = post_nms_top_n * batch; - param->rpn_rois_->mutable_data({total, 4}); - param->rpn_probs_->mutable_data({total, 1}); - - // DLOG << *param->rpn_rois_; - // DLOG << *param->rpn_probs_; - - param->float_bbox = std::make_shared(); - param->float_bbox->Resize(param->bbox_deltas_->dims()); - param->float_bbox->init(type_id().hash_code()); - fpga::format_fp32_ofm(param->float_bbox.get()); - param->float_score = std::make_shared(); - param->float_score->Resize(param->scores_->dims()); - param->float_score->init(type_id().hash_code()); - fpga::format_fp32_ofm(param->float_score.get()); - - auto input = param->bbox_deltas_; - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_HWC; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = 
(uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->float_bbox->mutable_data(); - args.output.scale_address = param->float_bbox->scale; - param->bbox_arg = args; - - input = param->scores_; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->float_score->mutable_data(); - args.output.scale_address = param->float_score->scale; - param->score_arg = args; - - param->score_index_ = std::make_shared(); - param->score_index_->mutable_data({input->numel()}); - auto score_index = param->score_index_->data(); - for (int i = 0; i < input->numel(); ++i) { - score_index[i] = i; - } - - return true; -} -template -void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) { - PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1), - "Dim not correct"); - int64_t index_size = index.dims()[0]; - - auto src_dims = src.dims(); - - const T *p_src = src.data(); - const int *p_index = index.data(); - T *p_output = output->data(); - - // slice size - int slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; - - const size_t slice_bytes = slice_size * sizeof(T); - - for (int64_t i = 0; i < index_size; ++i) { - int index_ = p_index[i]; - memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); - } -} - -void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { - auto *out_data = dst->data(); - auto *to_add_data = src.data(); - size_t size_of_t = framework::SizeOfType(src.type()); - offset *= size_of_t; - std::memcpy( - reinterpret_cast(reinterpret_cast(out_data) + offset), - to_add_data, src.numel() * size_of_t); -} - -template -static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, - Tensor *variances, Tensor *proposals) { - T 
*proposals_data = proposals->mutable_data(); - - int64_t row = all_anchors->dims()[0]; - int64_t len = all_anchors->dims()[1]; - - auto *bbox_deltas_data = bbox_deltas->data(); - auto *anchor_data = all_anchors->data(); - const T *variances_data = nullptr; - if (variances) { - variances_data = variances->data(); - } - - for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - - T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; - T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; - - T bbox_center_x = 0, bbox_center_y = 0; - T bbox_width = 0, bbox_height = 0; - - /* - if (variances) { - bbox_center_x = - variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width - + anchor_center_x; bbox_center_y = variances_data[i * len + 1] * - bbox_deltas_data[i * len + 1] * anchor_height + - anchor_center_y; - bbox_width = std::exp(std::min(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } else { - */ - bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; - bbox_center_y = - bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - - /* - bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - */ - bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; - bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; - // } - - proposals_data[i * len] = bbox_center_x - bbox_width / 2; - proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - /* - //wong - proposals_data[i * len + 2] = bbox_center_x + bbox_width 
/ 2 - 1; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; - //wong - */ - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; - } - // return proposals; -} - -template -static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) { - T *boxes_data = boxes->mutable_data(); - const T *im_info_data = im_info.data(); - T zero(0); - for (int64_t i = 0; i < boxes->numel(); ++i) { - if (i % 4 == 0) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else if (i % 4 == 1) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } else if (i % 4 == 2) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } - } -} - -template -static inline void FilterBoxes(Tensor *boxes, float min_size, - const Tensor &im_info, Tensor *keep) { - const T *im_info_data = im_info.data(); - T *boxes_data = boxes->mutable_data(); - T im_scale = im_info_data[2]; - keep->Resize({boxes->dims()[0]}); - min_size = std::max(min_size, 1.0f); - int *keep_data = keep->mutable_data(); - - int keep_len = 0; - for (int i = 0; i < boxes->dims()[0]; ++i) { - T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; - T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; - T ws_origin_scale = - (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; - T hs_origin_scale = - (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; - T x_ctr = boxes_data[4 * i] + ws / 2; - T y_ctr = boxes_data[4 * i + 1] + hs / 2; - if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && - x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { - keep_data[keep_len++] = i; - } - } - keep->Resize({keep_len}); -} - -template -static inline std::vector> GetSortedScoreIndex( - const std::vector &scores) { - std::vector> 
sorted_indices; - sorted_indices.reserve(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - sorted_indices.emplace_back(scores[i], i); - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices.begin(), sorted_indices.end(), - [](const std::pair &a, const std::pair &b) { - return a.first < b.first; - }); - return sorted_indices; -} - -template -static inline T BBoxArea(const T *box, bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); - } - } -} - -template -static inline Tensor VectorToTensor(const std::vector &selected_indices, - int selected_num) { - Tensor keep_nms; - keep_nms.Resize({selected_num}); - auto *keep_data = keep_nms.mutable_data(); - for (int i = 0; i < selected_num; ++i) { - keep_data[i] = selected_indices[i]; - } - return keep_nms; -} - -template -static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); - const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline Tensor NMS(Tensor *bbox, Tensor *scores, T 
nms_threshold, - float eta, int post_nms_num = 100) { - int64_t num_boxes = bbox->dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox->dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores->data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices = - GetSortedScoreIndex(scores_data); - - std::vector selected_indices; - int selected_num = 0; - T adaptive_threshold = nms_threshold; - const T *bbox_data = bbox->data(); - while ((sorted_indices.size() != 0) && (selected_num < post_nms_num)) { - int idx = sorted_indices.back().second; - bool flag = true; - for (int kept_idx : selected_indices) { - if (flag) { - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, false); - flag = (overlap <= adaptive_threshold); - } else { - break; - } - } - if (flag) { - selected_indices.push_back(idx); - ++selected_num; - } - sorted_indices.erase(sorted_indices.end() - 1); - if (flag && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } - return VectorToTensor(selected_indices, selected_num); -} - -template -std::pair ProposalForOneImage( - const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, - const Tensor &bbox_deltas_slice, // [M, 4] - const Tensor &scores_slice, // [N, 1] - const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n, - float nms_thresh, float min_size, float eta) { - auto *scores_data = scores_slice.data(); - - // Sort index - Tensor index_t; - index_t.Resize({scores_slice.numel()}); - int *index = index_t.mutable_data(); - /*for (int i = 0; i < scores_slice.numel(); ++i) { - index[i] = i; - }*/ - std::memcpy(index, score_index.data(), - scores_slice.numel() * sizeof(int)); - - auto compare = [scores_data](const int64_t &i, const int64_t &j) { - return scores_data[i] > scores_data[j]; - }; - - if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { - std::sort(index, index + scores_slice.numel(), compare); - } else { 
- std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(), - compare); - index_t.Resize({pre_nms_top_n}); - } - - Tensor scores_sel, bbox_sel, anchor_sel, var_sel; - scores_sel.mutable_data({index_t.numel(), 1}); - bbox_sel.mutable_data({index_t.numel(), 4}); - anchor_sel.mutable_data({index_t.numel(), 4}); - var_sel.mutable_data({index_t.numel(), 4}); - - CPUGather(scores_slice, index_t, &scores_sel); - CPUGather(bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(anchors, index_t, &anchor_sel); - Tensor proposals; - proposals.mutable_data({index_t.numel(), 4}); - BoxCoder(&anchor_sel, &bbox_sel, nullptr, &proposals); - - ClipTiledBoxes(im_info_slice, &proposals); - - Tensor keep; - FilterBoxes(&proposals, min_size, im_info_slice, &keep); - - Tensor scores_filter; - bbox_sel.mutable_data({keep.numel(), 4}); - scores_filter.mutable_data({keep.numel(), 1}); - - CPUGather(proposals, keep, &bbox_sel); - CPUGather(scores_sel, keep, &scores_filter); - if (nms_thresh <= 0) { - return std::make_pair(bbox_sel, scores_filter); - } - - // Tensor keep_nms = NMS(&bbox_sel, &scores_filter, nms_thresh, eta); - Tensor keep_nms = - NMS(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n); - - if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { - keep_nms.Resize({post_nms_top_n}); - } - - proposals.mutable_data({keep_nms.numel(), 4}); // original - scores_sel.mutable_data({keep_nms.numel(), 1}); // original - - // proposals.mutable_data({post_nms_top_n, 4}); // wong - // scores_sel.mutable_data({post_nms_top_n, 1}); // wong - CPUGather(bbox_sel, keep_nms, &proposals); - CPUGather(scores_filter, keep_nms, &scores_sel); - return std::make_pair(proposals, scores_sel); -} - -template <> -void ProposalKernel::Compute(const ProposalParam ¶m) { - auto input_score = param.scores_; - auto input_score_data = input_score->data(); - auto input_score_data_tmp = input_score->data(); - uint32_t score_n, score_height, score_width, score_channels; - - auto 
input_bbox = param.bbox_deltas_; - auto input_bbox_data = input_bbox->data(); - auto input_bbox_data_tmp = input_bbox->data(); - uint32_t bbox_n, bbox_height, bbox_width, bbox_channels; - - score_n = (uint32_t)(input_score->dims()[0]); - score_channels = (uint32_t)(input_score->dims()[1]); - score_height = (uint32_t)(input_score->dims()[2]); - score_width = (uint32_t)(input_score->dims()[3]); - - bbox_n = (uint32_t)(input_bbox->dims()[0]); - bbox_channels = (uint32_t)(input_bbox->dims()[1]); - bbox_height = (uint32_t)(input_bbox->dims()[2]); - bbox_width = (uint32_t)(input_bbox->dims()[3]); - - std::shared_ptr score_tmp = std::make_shared(); - score_tmp->Resize(param.scores_->dims()); - score_tmp->mutable_data(); - - std::shared_ptr bbox_tmp = std::make_shared(); - bbox_tmp->Resize(param.bbox_deltas_->dims()); - bbox_tmp->mutable_data(); - - auto score_tmp_data = score_tmp->data(); - auto bbox_tmp_data = bbox_tmp->data(); - int64_t amount_per_side = score_width * score_height; - int idx = 0; - fpga::fpga_invalidate( - input_score_data_tmp, - score_height * score_width * score_channels * sizeof(half)); - for (int h = 0; h < score_height; h++) { - for (int w = 0; w < score_width; w++) { - for (int c = 0; c < score_channels; c++) { - idx++; - // DLOG << "wong input_score: "<< - // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); - *(score_tmp_data + c * amount_per_side + score_width * h + w) = - (*(input_score_data_tmp++)); - } - } - } - amount_per_side = bbox_width * bbox_height; - fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width * - bbox_channels * sizeof(half)); - for (int h = 0; h < bbox_height; h++) { - for (int w = 0; w < bbox_width; w++) { - for (int c = 0; c < bbox_channels; c++) { - idx++; - // DLOG << "wong input_score: "<< - // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); - *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = - (*(input_bbox_data_tmp++)); - } - } - } - struct paddle_mobile::fpga::BypassArgs 
temp_score_arg; - struct paddle_mobile::fpga::BypassArgs temp_bbox_arg; - temp_score_arg = param.score_arg; - temp_score_arg.image.address = score_tmp->data(); - - temp_bbox_arg = param.bbox_arg; - temp_bbox_arg.image.address = bbox_tmp->data(); - auto score_tensor = param.float_score.get(); - fpga::PerformBypass(param.score_arg); - fpga::fpga_invalidate(score_tensor->data(), - score_tensor->numel() * sizeof(float)); - - auto bbox_tensor = param.float_bbox.get(); - fpga::PerformBypass(param.bbox_arg); - fpga::fpga_invalidate(bbox_tensor->data(), - bbox_tensor->numel() * sizeof(float)); - - auto *scores = param.float_score.get(); - auto *bbox_deltas = param.float_bbox.get(); - auto *im_info = param.im_info_; - auto anchors = *param.anchors_; - auto variances = *param.variances_; - - auto *rpn_rois = param.rpn_rois_; - auto *rpn_roi_probs = param.rpn_probs_; - - auto score_index = *(param.score_index_.get()); - - int pre_nms_top_n = param.pre_nms_topn_; - int post_nms_top_n = param.post_nms_topn_; - // DLOG << " param.post_nms_topn_ : " << param.post_nms_topn_; - - float nms_thresh = param.nms_thresh_ / 2.0f; - float min_size = param.min_size_; - float eta = param.eta_; - - auto &scores_dim = scores->dims(); - int64_t num = scores_dim[0]; - int64_t c_score = scores_dim[1]; - int64_t h_score = scores_dim[2]; - int64_t w_score = scores_dim[3]; - - auto &bbox_dim = bbox_deltas->dims(); - int64_t c_bbox = bbox_dim[1]; - int64_t h_bbox = bbox_dim[2]; - int64_t w_bbox = bbox_dim[3]; - - // - rpn_rois->mutable_data({bbox_deltas->numel(), 4}); - rpn_roi_probs->mutable_data({scores->numel(), 1}); - - framework::LoD lod; - lod.resize(1); - auto &lod0 = lod[0]; - lod0.push_back(0); - anchors.Resize({anchors.numel(), 4}); - variances.Resize({variances.numel(), 4}); - - int64_t num_proposals = 0; - for (int64_t i = 0; i < num; ++i) { - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1); - Tensor scores_slice = 
(*score_tensor).Slice(i, i + 1); - - bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox, 4}); - scores_slice.Resize({h_score * w_score * c_score, 1}); - - std::pair tensor_pair = ProposalForOneImage( - im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, - score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); - Tensor &proposals = tensor_pair.first; - Tensor &scores = tensor_pair.second; - - AppendProposals(rpn_rois, 4 * num_proposals, proposals); - AppendProposals(rpn_roi_probs, num_proposals, scores); - num_proposals += proposals.dims()[0]; - lod0.push_back(num_proposals); - } - rpn_rois->set_lod(lod); - rpn_roi_probs->set_lod(lod); - rpn_rois->Resize({num_proposals, 4}); - rpn_roi_probs->Resize({num_proposals, 1}); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PROPOSAL_OP diff --git a/mobile/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp deleted file mode 100644 index 7e0852ca4b..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp +++ /dev/null @@ -1,284 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PSROI_POOL_OP - -#include -#include -#include "operators/kernel/detection_kernel.h" - -#include "fpga/V1/api.h" -#include "fpga/V1/image.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { - auto dims = param->input_x_->dims(); - PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, - "data not aligned"); - - param->float_input = std::make_shared(); - param->float_input->mutable_data(param->input_x_->dims()); - // param->float_output = std::make_shared(); - - auto input = param->input_x_; - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_HWC; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->float_input->mutable_data(); - args.output.scale_address = param->float_input->scale; - param->input_arg = args; - - auto* rois = param->input_rois_; - int rois_num = rois->dims()[0]; - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, param->output_->dims()[1], param->output_->dims()[2], - param->output_->dims()[3]}); - param->output_->Resize(dims_out_new); - // fpga::format_fp16_ofm(param->output_); - - param->output_->mutable_data(dims_out_new); - // auto output = param->float_output.get(); - // param->output_ = output; - /* args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = output->data(); - args.image.height = (uint32_t)output->dims()[2]; - args.image.width = (uint32_t)output->dims()[3]; - args.image.channels = (uint32_t)output->dims()[1] ; - args.output.address = param->output_->mutable_data(); - args.output.scale_address = param->output_->scale; - 
param->output_arg = args;*/ - - return true; -} - -/* - template - void PSROIPoolingForward( - const Dtype* bottom_data, - const int height, const int width, const int input_channel, - Dtype* top_data, - const int pooled_height, const int pooled_width, const int output_channel, - const Dtype* bottom_rois, - const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h, - const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind) - { - - int hstart = floor(static_cast(ph) * Bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw)* Bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * Bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * Bin_size_w + roi_start_w); - - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - float32x4_t sum_pixels_low_c= vdupq_n_f32(0); - float32x4_t sum_pixels_high_c= vdupq_n_f32(0); - - if(!is_empty){ - Dtype bin_area = (hend - hstart) * (wend - wstart); - float rev_bin_area = 1 / bin_area; - float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area); - //static_cast(bin_area) float pixels_c[output_channel]; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int pixel_offset = (h * width + w) * input_channel; - for(int output_c = 0; output_c < output_channel; output_c++){ - int input_channel_offset = output_c * pooled_height * - pooled_width; int input_bias = pixel_offset + input_channel_offset + ph * - pooled_width + pw; pixels_c[output_c] = bottom_data[input_bias]; - } - float32x4_t pixel_low_c = vld1q_f32(pixels_c); - float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4); - sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c); - sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c); - } - } - sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, 
q_bin_area); - sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area); - } - - int output_index_base = (ph * pooled_width + pw) * output_channel; - top_data += output_index_base; - vst1q_f32(top_data, sum_pixels_low_c); - top_data += 4; - vst1q_f32(top_data, sum_pixels_high_c); - }*/ - -template -void PSROIPoolingForward(const Dtype* bottom_data, const int height, - const int width, const int input_channel, - Dtype* top_data, const int pooled_height, - const int pooled_width, const int output_channel, - const Dtype* bottom_rois, const Dtype Bin_size_h, - const Dtype Bin_size_w, const Dtype roi_start_h, - const Dtype roi_start_w, const int pw, const int ph, - const int roi_batch_ind) { - int hstart = floor(static_cast(ph) * Bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * Bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * Bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * Bin_size_w + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - float sum_pixels_c[output_channel] = {0}; - float pixels_c[output_channel] = {0}; - if (!is_empty) { - Dtype bin_area = (hend - hstart) * (wend - wstart); - float rec_bin_area = 1 / bin_area; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int pixel_offset = (h * width + w) * input_channel; - for (int output_c = 0; output_c < output_channel; output_c++) { - int input_channel_offset = output_c * pooled_height * pooled_width; - int input_bias = - pixel_offset + input_channel_offset + ph * pooled_width + pw; - pixels_c[output_c] = bottom_data[input_bias]; - } - - for (int output_c = 0; output_c < output_channel; output_c++) { - sum_pixels_c[output_c] += pixels_c[output_c]; - } - } - } - 
for (int output_c = 0; output_c < output_channel; output_c++) { - sum_pixels_c[output_c] *= rec_bin_area; - } - } - - int output_index_base = (ph * pooled_width + pw) * output_channel; - top_data += output_index_base; - memcpy(top_data, sum_pixels_c, output_channel * 4); -} - -template <> -void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { - auto input_tensor = param.float_input.get(); - fpga::PerformBypass(param.input_arg); - fpga::fpga_invalidate(input_tensor->data(), - input_tensor->numel() * sizeof(float)); - - auto* in = input_tensor; - auto* rois = param.input_rois_; - auto* out = param.output_; // param.float_output.get(); - - auto pooled_height = param.pooled_height_; - auto pooled_width = param.pooled_width_; - auto spatial_scale = param.spatial_scale_; - auto output_channels = param.output_channels_; - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto data_nhwc = in->mutable_data(); - - // fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), - (param.output_)->dims()[3]}); - - (param.output_)->Resize(dims_out_new); - - const float* input_data = data_nhwc; // in->data(); - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - auto rois_batch_id_data = rois_batch_id_list.mutable_data(); - - PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_MOBILE_ENFORCE( - rois_batch_size == batch_size, - "the rois_batch_size and input(X) batch_size should be the same."); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num, - "the rois_num from input and lod must be the same"); - - 
PADDLE_MOBILE_ENFORCE( - input_channels == output_channels * pooled_height * pooled_width, - "the channels of input X should equal the product of " - "output_channels x pooled_height x pooled_width"); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - auto output_data = out->mutable_data(); - auto input_rois = rois->data(); - - for (int n = 0; n < rois_num; ++n) { - auto offset_input_rois = input_rois + n * 4; - auto offset_output_data = - output_data + pooled_height * pooled_width * output_channels * n; - - auto roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - auto roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - auto roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - auto roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small rois to be 1 x 1 - auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 - auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); - - // Compute bin size w and h at input feature map - auto bin_size_h = roi_height / static_cast(pooled_height); - auto bin_size_w = roi_width / static_cast(pooled_width); - - int roi_batch_ind = rois_batch_id_data[n]; - - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - PSROIPoolingForward(input_data, height, width, input_channels, - offset_output_data, pooled_height, - pooled_width, output_channels, input_rois, - bin_size_h, bin_size_w, roi_start_h, - roi_start_w, pw, ph, roi_batch_ind); - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PSROI_POOL_OP diff --git a/mobile/src/operators/kernel/fpga/V1/relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/relu_kernel.cpp deleted file mode 100644 index 75dda4bf6d..0000000000 --- 
a/mobile/src/operators/kernel/fpga/V1/relu_kernel.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef RELU_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReluKernel::Init(ReluParam *param) { - param->Out()->ShareDataWith(*param->InputX()); - return true; -} - -template <> -void ReluKernel::Compute(const ReluParam ¶m) { - PADDLE_MOBILE_ENFORCE(0, "relu as a single op is wrong"); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/reshape2_kernel.cpp deleted file mode 100644 index 647ecb5a65..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/reshape2_kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef RESHAPE2_OP - -#include "operators/kernel/reshape2_kernel.h" -#include "framework/ddim.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Reshape2Kernel::Init(Reshape2Param *param) { - auto input = const_cast(param->InputX()); - auto output = param->Out(); - auto shape = param->Shape(); - - auto num_in = framework::product(input->dims()); - auto num_shape = framework::product(framework::make_ddim(shape)); - PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); - - for (int i = 0; i < shape.size(); i++) { - if (shape[i] == -1) { - shape[i] = static_cast(-num_in / num_shape); - break; - } - } - output->Resize(framework::make_ddim(shape)); - output->set_type(input->type()); - fpga::format_ofm(output); - DLOG << "input: " << input; - DLOG << "output: " << output; - - return true; -} - -void reshape(LoDTensor *input, LoDTensor *output) { - // Subscript r means after reshape - - auto input_ptr = input->data(); - auto output_ptr = output->data(); - output->scale[0] = input->scale[0]; - output->scale[1] = input->scale[1]; - - auto C = static_cast(input->dims()[1]); - auto H = static_cast(input->dims()[2]); - auto W = static_cast(input->dims()[3]); - auto Cr = static_cast(output->dims()[1]); - auto Hr = static_cast(output->dims()[2]); - auto Wr = static_cast(output->dims()[3]); - PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match"); - auto WC = W * C; - auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT); - auto HW = H * W; - auto WCr = Wr * Cr; - auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT); - auto HWr = Hr * Wr; - - fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half)); - - int offset_align = 0; - int offset_r = 0, offset_align_r = 0; - int cr = 0, hr = 0, wr = 0; - - for (int h = 0; h < H; h++) { - int offset0 = h * WC_align; - for (int w = 0; w < W; w++) { - int offset1 = w * C + 
offset0; - for (int c = 0; c < C; c++) { - offset_align = offset1 + c; - offset_r = c * HW + h * W + w; - cr = offset_r / HWr; - hr = offset_r % HWr / Wr; - wr = offset_r % Wr; - offset_align_r = hr * WCr_align + wr * Cr + cr; - output_ptr[offset_align_r] = input_ptr[offset_align]; - } - } - } - - fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half)); -} - -template <> -void Reshape2Kernel::Compute(const Reshape2Param ¶m) { - auto input = const_cast(param.InputX()); - auto output = param.Out(); - auto shape = param.Shape(); - - auto num_in = framework::product(input->dims()); - auto num_shape = framework::product(framework::make_ddim(shape)); - PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); - - for (int i = 0; i < shape.size(); i++) { - if (shape[i] == -1) { - shape[i] = static_cast(-num_in / num_shape); - break; - } - } - output->Resize(framework::make_ddim(shape)); - if (output->dims() == input->dims()) { - DLOG << "No need to reshape"; - output->ShareDataWith(*input); - framework::LoD lod = input->lod(); - output->set_lod(lod); - return; - } - - reshape(input, output); - // -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/reshape_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/reshape_kernel.cpp deleted file mode 100644 index 5e01bb74ba..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/reshape_kernel.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef RESHAPE_OP - -#include "operators/kernel/reshape_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReshapeKernel::Init(ReshapeParam *param) { - param->Out()->ShareDataWith(*param->InputX()); - const int in_n = param->InputX()->dims()[0]; - const int in_c = param->InputX()->dims()[1]; - const int in_h = param->InputX()->dims()[2]; - const int in_w = param->InputX()->dims()[3]; - auto out = param->Out(); - out->Resize(framework::make_ddim({in_n, in_c * in_h * in_w})); - return true; -} - -template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp deleted file mode 100644 index ec8d19db80..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ROIALIGN_POOL_OP - -#include -#include -#include "operators/kernel/detection_kernel.h" - -#include "fpga/V1/api.h" -#include "fpga/V1/image.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool RoiAlignPoolKernel::Init(RoiAlignPoolParam* param) { - auto dims = param->input_x_->dims(); - PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, - "data not aligned"); - - param->float_input = std::make_shared(); - param->float_input->mutable_data(param->input_x_->dims()); - - auto input = param->input_x_; - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_HWC; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->float_input->mutable_data(); - args.output.scale_address = param->float_input->scale; - param->input_arg = args; - - auto* rois = param->input_rois_; - int rois_num = rois->dims()[0]; - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, param->output_->dims()[1], param->output_->dims()[2], - param->output_->dims()[3]}); - param->output_->Resize(dims_out_new); - - param->output_->mutable_data(dims_out_new); - - return true; -} - -template -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - T w1; - T w2; - T w3; - T w4; -}; - -template -void pre_calc_for_bilinear_interpolate( - const int height, const int width, const int pooled_height, - const int pooled_width, const int iy_upper, const int ix_upper, - T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, - int roi_bin_grid_h, int roi_bin_grid_w, - std::vector>& pre_calc) { // NOLINT - int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; 
pw++) { - for (int iy = 0; iy < iy_upper; iy++) { - const T yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { - const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T x = xx; - T y = yy; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y <= 0) { - y = 0; - } - if (x <= 0) { - x = 0; - } - - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indeces - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; - } - } - } - } -} - -template -void ROIAlignForward(const int nthreads, const T* bottom_data, - const T& spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, - const T* bottom_rois, T* top_data) { - int n_rois = nthreads / channels / pooled_width / pooled_height; - - for (int n = 0; n < n_rois; n++) { - int index_n = n * channels * pooled_width * pooled_height; - - // roi could have 4 or 5 columns - const T* offset_bottom_rois = bottom_rois + n * 4; - int roi_batch_ind = 0; - // if (roi_cols == 5) { - // roi_batch_ind = offset_bottom_rois[0]; - // offset_bottom_rois++; - // } - - // Do not using rounding; this implementation detail is critical - T roi_start_w = offset_bottom_rois[0] * spatial_scale; - T roi_start_h = offset_bottom_rois[1] * spatial_scale; - T roi_end_w = offset_bottom_rois[2] * spatial_scale; - T roi_end_h = offset_bottom_rois[3] * spatial_scale; - // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); - // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); - // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); - // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); - T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = 
(sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 - - // we want to precalculate indeces and weights shared by all chanels, - // this is the key point of optimiation - std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * - pooled_width * pooled_height); - pre_calc_for_bilinear_interpolate( - height, width, pooled_height, pooled_width, roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, - roi_bin_grid_h, roi_bin_grid_w, pre_calc); - - for (int c = 0; c < channels; c++) { - int index_n_c = index_n + c * pooled_width * pooled_height; - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - int pre_calc_index = 0; - - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - int index = index_n_c + ph * pooled_width + pw; - - T output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_bottom_data[pc.pos1] + - pc.w2 * offset_bottom_data[pc.pos2] + - pc.w3 * offset_bottom_data[pc.pos3] + - pc.w4 * offset_bottom_data[pc.pos4]; - - pre_calc_index += 1; - } - } - output_val /= count; - - top_data[index] = output_val; - } // for pw - } // for ph - } // for c - } // for n -} - -template <> -void RoiAlignPoolKernel::Compute( - const RoiAlignPoolParam& param) { - auto input_tensor = param.float_input.get(); - fpga::PerformBypass(param.input_arg); - fpga::fpga_invalidate(input_tensor->data(), - input_tensor->numel() * sizeof(float)); - - auto* in = input_tensor; - auto* rois = param.input_rois_; - auto* out = param.output_; // param.float_output.get(); - - auto pooled_height = 
param.pooled_height_; - auto pooled_width = param.pooled_width_; - auto spatial_scale = param.spatial_scale_; - auto sampe_ratio = param.sampling_ratio_; - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto data_nhwc = in->mutable_data(); - - fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), - (param.output_)->dims()[3]}); - (param.output_)->Resize(dims_out_new); - - const int index = input_channels * pooled_height * pooled_width * rois_num; - auto rois_data = rois->data(); - auto top_data = param.output_->mutable_data(); - for (int i = 0; i < index; ++i) { - ROIAlignForward(index, data_nhwc, spatial_scale, input_channels, - height, width, pooled_height, pooled_width, - sampe_ratio, rois_data, top_data); - } - - fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height, - pooled_width, rois_num); - out->reset_data_ptr(top_data); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ROIALIGN_POOL_OP diff --git a/mobile/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp deleted file mode 100644 index 8fa6feda7f..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SIGMOID_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SigmoidKernel::Init(SigmoidParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::SIGMOID; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); - auto out = param->Out(); - fpga::format_fp16_ofm(out); - - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = input_ptr; - args.image.height = 1; - args.image.width = 1; - args.image.channels = input->fpga_data_num; - args.output.address = out->data(); - args.output.scale_address = out->scale; - args.output.activation.activation_type = activation_enable; - args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; - param->SetFpgaArgs(args); - return true; -} -template <> -void SigmoidKernel::Compute(const SigmoidParam ¶m) { - fpga::PerformBypass(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/slice_kernel.cpp deleted file mode 100644 index 2fd6ef542e..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/slice_kernel.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SLICE_OP - -#include "operators/kernel/slice_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SliceKernel::Init(SliceParam* param) { - auto output = param->output_; - fpga::format_fp16_ofm(output); - DLOG << "input: " << param->input_; - DLOG << "output: " << param->output_; - if (param->input_->type() != type_id()) { - DLOG << "wrong type"; - } - return true; -} -template <> -void SliceKernel::Compute(const SliceParam& param) { - // Only support slicing in channel dimension - // Only support half data - // W must be aligned to 16 - - auto input = param.input_; - auto output = param.output_; - int HW = input->dims()[2] * input->dims()[3]; - int channel = input->dims()[1]; - auto input_ptr = input->data(); - auto output_ptr = output->data(); - - output->scale[0] = input->scale[0]; - output->scale[1] = input->scale[1]; - - int start = param.starts_[0], end = param.ends_[0]; - start = start < 0 ? start + channel : start; - end = end < 0 ? end + channel : end; - start = start > channel ? channel : start; - end = end > channel ? 
channel : end; - int len = end - start; - size_t size = len * sizeof(half); - - for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); - } -} -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/softmax_kernel.cpp deleted file mode 100644 index ac7a7bdc77..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SOFTMAX_OP - -#include "operators/kernel/softmax_kernel.h" -#include "operators/kernel/central-arm-func/softmax_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SoftmaxKernel::Init(SoftmaxParam *param) { - auto input = const_cast(param->InputX()); - auto dims = framework::vectorize(input->dims()); - half *input_ptr; - auto out = param->Out(); - if (input->type() == type_id()) { - out->Resize(framework::make_ddim(dims)); - out->mutable_data(framework::make_ddim(dims)); - } else { - input_ptr = input->data(); - } - - auto float_input = new LoDTensor; - - int input_n = 1, input_c = 1, input_h = 1, input_w = 1; - if (dims.size() == 4) { - input_h = dims[1]; - input_w = dims[2]; - input_c = dims[3]; - if (input_c == 1) { // This input is generated by FC op, dims = [N C 1 1] - PADDLE_MOBILE_ENFORCE(input_w == 1, "Softmax input must come from FC op"); - input_c = dims[1]; - input_h = 1; - } - } else if (dims.size() == 2) { - input_c = dims[1]; - } - input->Resize(framework::make_ddim(dims)); - float_input->Resize(framework::make_ddim(dims)); - - if (input_c == 2 && input->type() == type_id()) { // Use FPGA - fpga::format_fp16_ofm(out); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = input_ptr; - args.image.height = input_h; - args.image.width = input_w; - args.image.channels = input_c; - args.output.address = out->data(); - args.output.scale_address = out->scale; - args.output.activation.activation_type = fpga::SOFTMAX; - param->SetFpgaArgs(args); - } else { // Use CPU - out->Resize(framework::make_ddim(dims)); - out->mutable_data(framework::make_ddim(dims)); - float_input->init(type_id().hash_code()); - float_input->mutable_data(framework::make_ddim(dims)); - fpga::format_fp32_ofm(float_input); - 
fpga::format_fp32_ofm(out); - - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input_ptr; - args.image.height = input_h; - args.image.width = input_w; - args.image.channels = input_c; - args.output.address = float_input->data(); - args.output.scale_address = float_input->scale; - param->SetFloatInput(float_input); - param->SetFpgaArgs(args); - } - - return true; -} - -template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - auto *in_x = (param.InputX()); - auto dims = in_x->dims(); - auto n = 1; - auto h = 1; - auto w = 1; - auto c = 1; - if (dims.size() == 4) { - h = dims[1]; - w = dims[2]; - c = dims[3]; - if (c == 1) { // This input is generated by FC op, dims = [N C 1 1] - PADDLE_MOBILE_ENFORCE(w == 1, "Softmax input must come from FC op"); - c = dims[1]; - h = 1; - } - } else if (dims.size() == 2) { - c = dims[1]; - } - if (in_x->type() == type_id()) { - fpga::PerformBypass(param.FpgaArgs()); - if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { - Tensor *out = param.Out(); - Tensor *in_x2 = param.FloatInput(); - - fpga::fpga_invalidate(in_x2->data(), - in_x2->numel() * sizeof(float)); - math::SoftmaxFuntor()(in_x2, out); - fpga::fpga_flush(out->data(), out->memory_size()); - } - } else { - if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { - Tensor *out = param.Out(); - out->Resize({n, h, w, c}); - math::SoftmaxFuntor()(in_x, out); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/split_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/split_kernel.cpp deleted file mode 100644 index 584cb41fb3..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/split_kernel.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SPLIT_OP - -#include "operators/kernel/split_kernel.h" - -namespace paddle_mobile { -namespace operators { -template <> -bool SplitKernel::Init(SplitParam *param) { - auto *in = const_cast(param->InputX()); - auto outs = param->Outs(); - auto sections = param->Sections(); - int axis = param->Axis(); - PADDLE_MOBILE_ENFORCE(axis == 1, "Only support split in channel dimension"); - PADDLE_MOBILE_ENFORCE(outs.size() == sections.size(), - "Output number should be equal to section number"); - auto image_num = (uint32_t)outs.size(); - auto images_out = - reinterpret_cast(fpga::fpga_malloc(image_num * sizeof(void *))); - auto scales_out = reinterpret_cast( - fpga::fpga_malloc(image_num * sizeof(float *))); - auto out_channels = reinterpret_cast( - fpga::fpga_malloc(image_num * sizeof(uint32_t))); - DLOG << "input: " << in; - for (int i = 0; i < image_num; i++) { - fpga::format_fp16_ofm(outs[i]); - DLOG << "output: " << outs[i]; - images_out[i] = outs[i]->mutable_data(); - scales_out[i] = outs[i]->scale; - out_channels[i] = (uint32_t)sections[i]; - } - - auto deleter = [](void *p) { fpga::fpga_free(p); }; - - fpga::SplitArgs arg = {0}; - arg.image_num = image_num; - arg.image_in = in->data(); - arg.scale_in = in->scale; - arg.images_out = images_out; - arg.scales_out = scales_out; - arg.out_channel_nums = out_channels; - arg.height = (uint32_t)in->dims()[2]; - arg.width = 
(uint32_t)in->dims()[3]; - arg.vector_split_space.push_back( - std::shared_ptr(reinterpret_cast(images_out), deleter)); - arg.vector_split_space.push_back( - std::shared_ptr(reinterpret_cast(scales_out), deleter)); - arg.vector_split_space.push_back( - std::shared_ptr(reinterpret_cast(out_channels), deleter)); - - param->SetFpgaArgs(arg); - return true; -} -template <> -void SplitKernel::Compute(const SplitParam ¶m) { - fpga::ComputeFPGASplit(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/tanh_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/tanh_kernel.cpp deleted file mode 100644 index d7bbc5f043..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/tanh_kernel.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef TANH_OP - -#include "operators/kernel/tanh_kernel.h" -#include -namespace paddle_mobile { -namespace operators { - -template <> -bool TanhKernel::Init(TanhParam *param) { - auto input = const_cast(param->InputX()); - DLOG << "input: " << input; - auto input_ptr = input->data(); - auto float_input = new LoDTensor; - - float_input->mutable_data( - {1, input->dims()[1], input->dims()[2], input->dims()[3]}); - fpga::format_fp32_ofm(float_input); - - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input_ptr; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = float_input->data(); - args.output.scale_address = float_input->scale; - param->SetFloatInput(float_input); - param->SetFpgaArgs(args); - return true; -} - -#define EXP_MAX_INPUT 40.0 -template -T Tanh(const T a) { - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} -template -void tanhFuntor(Tensor *input, Tensor *output) { - auto *input_ptr = input->data(); - auto *output_ptr = output->mutable_data(); - for (int i = 0; i < input->numel(); i++) { - *(output_ptr + i) = Tanh(*(input_ptr + i)); - } -} -template <> -void TanhKernel::Compute(const TanhParam ¶m) { - Tensor *in_x = param.FloatInput(); - Tensor *out = param.Out(); - - fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate((void *)in_x->data(), - in_x->numel() * sizeof(float)); - tanhFuntor(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/mobile/src/operators/kernel/fpga/V1/transpose2_kernel.cpp deleted file mode 100644 index cc839a971e..0000000000 --- a/mobile/src/operators/kernel/fpga/V1/transpose2_kernel.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef TRANSPOSE2_OP - -#include "operators/kernel/transpose2_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Transpose2Kernel::Init(Transpose2Param *param) { - auto input = param->InputX(); - auto output = param->Out(); - auto axis = param->Axis(); - auto dim = input->dims(); - output->ShareDataWith(*input); - - auto dim_v = vectorize(dim); - - for (int i = 0; i < axis.size(); i++) { - dim_v[i] = dim[axis[i]]; - } - output->Resize(framework::make_ddim(dim_v)); - - DLOG << "input: " << input; - DLOG << "output: " << output; - return true; -} - -template <> -void Transpose2Kernel::Compute( - const Transpose2Param ¶m) { - // Transpose2Compute(param); - auto input = param.InputX(); - auto output = param.Out(); - - output->Resize({input->dims()[0], output->dims()[1], output->dims()[2], - output->dims()[3]}); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp deleted file mode 100755 index 56cc8927f0..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ANCHOR_GENERATOR_OP - -#include -#include -#include -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool AnchorGeneratorKernel::Init( - AnchorGeneratorParam *param) { - auto input = param->input_; - auto anchors = param->output_anchors_; - auto anchor_ptr = anchors->mutable_data(); - auto stride = param->stride_; - auto feature_width = input->dims()[3], feature_height = input->dims()[2]; - auto stride_width = stride[0], stride_height = stride[1]; - auto offset = param->offset_; - - int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, - -20, 39, 36, -43, -34, 59, 49, -63, -54, - 79, 69, -96, -77, 112, 93, -137, -118, 153, - 134, -204, -188, 220, 204, -281, -395, 296, 411}; - - int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103, - 0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58, - 0, 0, 34, 68, 0, 0, 24, 28, 0, 0, 19, 46}; - - if (offset > 0.6) { - memcpy(anchors_offset, anchors_offset2, sizeof(anchors_offset)); - DLOG << "anchor generator marker"; - } else { - DLOG << "anchor generator rfcn"; - } - int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4); - - // DLOG << "feature_height: " << feature_height; - // DLOG << "feature_width: " << feature_width; - // DLOG << "num_anchors: " << num_anchors; - // DLOG << "stride_width: " << stride_width; - // DLOG << "stride_height: " << stride_height; - - for (int h_idx = 0; h_idx < feature_height; ++h_idx) { - int offset0 = h_idx * feature_width * num_anchors * 4; - for (int w_idx = 0; w_idx < feature_width; ++w_idx) { - int offset1 = w_idx * num_anchors * 4; - for (int idx = 0; idx < num_anchors; idx++) { - int offset = offset0 + offset1 + idx * 4; - anchor_ptr[offset + 0] = - anchors_offset[idx * 4 + 0] + w_idx * stride_width; - anchor_ptr[offset + 1] = - anchors_offset[idx * 4 + 1] + h_idx * stride_height; - anchor_ptr[offset + 2] = - anchors_offset[idx * 4 + 2] + w_idx * stride_width; - anchor_ptr[offset + 
3] = - anchors_offset[idx * 4 + 3] + h_idx * stride_height; - } - } - } - return true; -} - -template <> -void AnchorGeneratorKernel::Compute( - const AnchorGeneratorParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ANCHOR_GENERATOR_OP diff --git a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp deleted file mode 100755 index 8442eef8b2..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONCAT_OP - -#include "operators/kernel/concat_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConcatKernel::Init(ConcatParam *param) { - auto inputs = param->Inputs(); - auto out = param->Out(); - auto image_num = inputs.size(); - auto images_in = - (int8_t **)fpga::fpga_malloc(image_num * sizeof(int8_t *)); // NOLINT - auto scales_in = - (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT - auto channel_num = - (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT - - auto height = inputs[0]->dims()[2]; - auto width = inputs[0]->dims()[3]; - for (int i = 0; i < image_num; i++) { - auto input = inputs[i]; - PADDLE_MOBILE_ENFORCE( - input->dims()[2] == height && input->dims()[3] == width, - "Image height & width should be unified"); - images_in[i] = input->data(); - channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT - scales_in[i] = input->scale; - } - fpga::format_concat_output(out, height, width, image_num, channel_num); - - fpga::ConcatArgs concatArgs = {0}; - concatArgs.image_num = image_num; - concatArgs.images_in = images_in; - concatArgs.scales_in = scales_in; - concatArgs.image_out = out->data(); - concatArgs.scale_out = out->scale; - concatArgs.channel_num = channel_num; - concatArgs.height = height; - concatArgs.width = width; - - auto deleter = [](void *p) { fpga::fpga_free(p); }; - concatArgs.vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(concatArgs.images_in), deleter)); - concatArgs.vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(concatArgs.scales_in), deleter)); - concatArgs.vector_concat_space.push_back(std::shared_ptr( - reinterpret_cast(concatArgs.channel_num), deleter)); - - param->SetFpgaArgs(concatArgs); - return true; -} - -template <> -void ConcatKernel::Compute(const ConcatParam ¶m) { - ComputeFPGAConcat(param.FpgaArgs()); -} -template class ConcatKernel; - -} // namespace operators -} // namespace paddle_mobile - 
-#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp deleted file mode 100644 index 2e4a8871fc..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBN_OP - -#include "operators/kernel/conv_add_bn_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { - bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - - auto bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && - bias->dims()[0] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - - const int channel = 
out->dims()[1]; - auto bs_ptr = - reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; - bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - - delete new_scale; - delete new_bias; - - return true; -} - -template <> -void ConvAddBNKernel::Compute( - const FusionConvAddBNParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp deleted file mode 100644 index 8c65ee0627..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBNRELU_OP - -#include "operators/kernel/conv_add_bn_relu_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { -template <> -bool ConvAddBNReluKernel::Init( - FusionConvAddBNReluParam *param) { - bool relu_enabled = true; - auto input = const_cast(param->Input()); - auto bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - const int groups = param->Groups(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - vector paddings = param->Paddings(); - vector strides = param->Strides(); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && - bias->dims()[0] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - - const int channel = out->dims()[1]; - auto bs_ptr = - reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; - bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; - if (groups == channel) { - new_scale_ptr[i] = new_scale_ptr[i] * Si / So; - new_bias_ptr[i] = new_bias_ptr[i] * 127.0f / So; - } - } - - if (groups == channel) { - 
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); - fpga::DWconvArgs dwconv_arg = {0}; - fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled, - strides[0], strides[1], paddings[0], paddings[1], - new_bias_ptr); - param->SetFpgaArgs(dwconv_arg); - fpga::fpga_free(bs_ptr); - delete new_scale; - } else { - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), strides[0], strides[1], paddings[0], - paddings[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - delete new_scale; - delete new_bias; - } - - return true; -} - -template <> -void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWConv(param.FpgaDwconvArgs()); - } else { - fpga::ComputeFpgaConv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/conv_add_kernel.cpp deleted file mode 100644 index d0a08abdda..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_add_kernel.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVADD_OP - -#include "operators/kernel/conv_add_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddKernel::Init(FusionConvAddParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = Si / So * Sf / 127.0; - bs_ptr[i] = bias_ptr[i] * 127.0 / So; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp deleted file mode 100644 index 508e835b67..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDRELU_OP - -#include "operators/kernel/conv_add_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = Si / So * Sf / 127.0; - bs_ptr[i] = bias_ptr[i] * 127.0 / So; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp 
b/mobile/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp deleted file mode 100644 index d3de98705e..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBN_OP - -#include "operators/kernel/conv_bn_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvBNKernel::Init(FusionConvBNParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // // NOLINT - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = 
new_bias->mutable_data({channel}); - - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; - bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - delete new_scale; - delete new_bias; - return true; -} - -template <> -void ConvBNKernel::Compute(const FusionConvBNParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp deleted file mode 100644 index 9ea962c111..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_CONVBNRELU_OP - -#include "operators/kernel/conv_bn_relu_kernel.h" -#include -namespace paddle_mobile { -namespace operators { -template <> -bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - const int groups = param->Groups(); - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0; - bs_ptr[i] = new_bias_ptr[i] * 127.0 / So; - if (groups == channel) { - new_scale_ptr[i] = new_scale_ptr[i] * Si / So; - new_bias_ptr[i] = new_bias_ptr[i] * 127.0 / So; - } - } - if (groups == channel) { - fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); - fpga::DWconvArgs dwconv_arg = {0}; - fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, true, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], 
param->Paddings()[1], - new_bias_ptr); - param->SetFpgaArgs(dwconv_arg); - fpga::fpga_free(bs_ptr); - } else { - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, true, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - } - delete new_scale; - delete new_bias; - return true; -} - -template <> -void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWConv(param.FpgaDwconvArgs()); - } else { - fpga::ComputeFpgaConv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/conv_kernel.cpp deleted file mode 100644 index 9a003543d5..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_kernel.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_OP - -#include "operators/kernel/conv_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvKernel::Init(ConvParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - int channel = out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = Si / So * Sf / 127.0; - bs_ptr[i] = 0; - } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, false, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void ConvKernel::Compute(const ConvParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp deleted file mode 100644 index c09e1ced8a..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_TRANSPOSE_OP - -#include "operators/kernel/conv_transpose_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvTransposeKernel::Init(ConvTransposeParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = 0; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So; - bs_ptr[i] = 0; - } - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - 
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f; - bs_ptr[i] = 0; - } - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp deleted file mode 100644 index 1dcb5d7d41..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDBN_OP - -#include "operators/kernel/deconv_add_bn_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->InputBias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - 
param->SetFpgaArgs(DWDeconv_arg); - } else { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void DeconvAddBNKernel::Compute( - const FusionDeconvAddBNParam ¶m) { - // fpga::ComputeFpgaDeconv(param.FpgaArgs()); - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp deleted file mode 100644 index 4c8b4ec3c2..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDBNRELU_OP - -#include "operators/kernel/deconv_add_bn_relu_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddBNReluKernel::Init( - FusionDeconvAddBNReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->InputBias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], 
param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void DeconvAddBNReluKernel::Compute( - const FusionDeconvAddBNReluParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp deleted file mode 100644 index 179d58ac99..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADD_OP - -#include "operators/kernel/deconv_add_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 1; - bs_ptr[i] = bias_ptr[i % (channel)]; - } - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, false, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - 
param->SetFpgaArgs(DWDeconv_arg); - } else { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, false, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - - return true; -} - -template <> -void DeconvAddKernel::Compute( - const FusionDeconvAddParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp deleted file mode 100644 index c7e728a169..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVADDRELU_OP - -#include "operators/kernel/deconv_add_relu_kernel.h" -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvAddReluKernel::Init( - FusionDeconvAddReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->Bias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = Si / So; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = 
Si / So * Sf / 127.0f; - bs_ptr[i] = bias_ptr[i % (channel)] * 127.0f / So; - } - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - return true; -} - -template <> -void DeconvAddReluKernel::Compute( - const FusionDeconvAddReluParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp deleted file mode 100644 index 081087b7ad..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_DECONVBNRELU_OP - -#include "operators/kernel/deconv_bn_relu_kernel.h" -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DeconvBNReluKernel::Init( - FusionDeconvBNReluParam *param) { - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - const Tensor *bias = param->InputBias(); - auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); - auto out = param->Output(); - float Si = input->scale[0]; - float So = out->scale[0]; - float Sf = fpga::filter_find_max(filter); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], - "Output channel should be equal to bias number"); - int channel = out->dims()[1]; - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - } - - int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT - sizeof(float)); // NOLINT - if (param->Groups() == channel) { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel] * Si / So; - bs_ptr[i] = new_bias_ptr[i % (channel)] * 127.0f / So; - } - } else { - for (int i = 0; i < channel * sub_conv_n; i++) { - bs_ptr[i + sub_conv_n * channel] = - new_scale_ptr[i 
% channel] * Si / So * Sf / 127.0f; - bs_ptr[i] = new_bias_ptr[i % (channel)] * 127.0f / So; - } - } - PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], - "stride_width should be equal to stride_height "); - PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], - "filter width should be equal to filter height "); - PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), - "filter axis should be the multiple of stride axis "); - if (param->Groups() == channel) { - fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), - sub_conv_n); - fpga::DWDeconvArgs DWDeconv_arg = {0}; - fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, true, - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(DWDeconv_arg); - } else { - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, true, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); - } - delete new_scale; - delete new_bias; - return true; -} - -template <> -void DeconvBNReluKernel::Compute( - const FusionDeconvBNReluParam ¶m) { - if (param.Groups() == param.Output()->dims()[1]) { - fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); - } else { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/dropout_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/dropout_kernel.cpp deleted file mode 100644 index 8b990d46e0..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/dropout_kernel.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef DROPOUT_OP - -#include "operators/kernel/dropout_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DropoutKernel::Init(DropoutParam *param) { - param->Out()->ShareDataWith(*param->InputX()); - return true; -} - -template <> -void DropoutKernel::Compute(const DropoutParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp deleted file mode 100644 index 54ae3b6712..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef ELEMENTWISEADD_OP -#include "operators/kernel/elementwise_add_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { - auto *input_y = const_cast(param->InputY()); - auto *out = param->Out(); - auto *input_x = const_cast(param->InputX()); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); - float Si_1 = input_x->scale[0]; - float Si_2 = input_y->scale[0]; - float So = out->scale[0]; - float C1 = Si_1 / So; - float C2 = Si_2 / So; - fpga::EWAddArgs ewaddArgs = {0}; - ewaddArgs.const0 = 1; - ewaddArgs.const1 = 1; - ewaddArgs.relu_enabled = 0; - ewaddArgs.image0.address = input_x_ptr; - ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; - ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; - ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; - ewaddArgs.image0.pad_height = 0; - ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = input_y_ptr; - ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; - ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; - ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; - ewaddArgs.image1.pad_height = 0; - ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = out_ptr; - fpga::expand_EW_arg(&ewaddArgs); - param->SetFpgaArgs(ewaddArgs); - return true; -} - -void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { - int inputc = ewaddArgs.image0.channels; - int inputh = ewaddArgs.image0.height; - int inputw = ewaddArgs.image0.width; - float inScale0 = - (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; - float inScale1 = - (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; - float outScale = - (reinterpret_cast(ewaddArgs.output.scale_address))[0]; - int8_t 
*inPtr0 = reinterpret_cast(ewaddArgs.image0.address); - int8_t *inPtr1 = reinterpret_cast(ewaddArgs.image1.address); - int8_t *outPtr = reinterpret_cast(ewaddArgs.output.address); - int datasize = inputc * inputh * inputw; - float const0 = inScale0 / outScale; - float const1 = inScale1 / outScale; - fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); - fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); - for (int i = 0; i < datasize; i++) { - float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; - int tmpI = static_cast(round(tmpF)); - outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < -127 ? -127 : tmpI))); - } - fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); -} -template <> -void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) { - // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); - ComputeCPUEWAdd(param.FpgaArgs()); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp deleted file mode 100644 index c406a22d56..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_ELEMENTWISEADDRELU_OP -#include "operators/kernel/elementwise_add_relu_kernel.h" -#include - -namespace paddle_mobile { -namespace operators { - -template <> -bool ElementwiseAddReluKernel::Init( - ElementwiseAddReluParam *param) { - auto *input_x = const_cast(param->InputX()); - auto *input_y = const_cast(param->InputY()); - auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); - float Si_1 = input_x->scale[0]; - float Si_2 = input_y->scale[0]; - float So = out->scale[0]; - float C1 = Si_1 / So; - float C2 = Si_2 / So; - fpga::EWAddArgs ewaddArgs = {0}; - ewaddArgs.relu_enabled = 1; - ewaddArgs.const0 = 1; - ewaddArgs.const1 = 1; - ewaddArgs.image0.address = input_x_ptr; - ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; - ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; - ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; - ewaddArgs.image0.pad_height = 0; - ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = input_y_ptr; - ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; - ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; - ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; - ewaddArgs.image1.pad_height = 0; - ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = out_ptr; - fpga::expand_EW_arg(&ewaddArgs); - param->SetFpgaArgs(ewaddArgs); - return true; -} - -void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { - int inputc = ewaddArgs.image0.channels; - int inputh = ewaddArgs.image0.height; - int inputw = ewaddArgs.image0.width; - float inScale0 = - (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; - float inScale1 = - (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; - float outScale = - 
(reinterpret_cast(ewaddArgs.output.scale_address))[0]; - int8_t *inPtr0 = reinterpret_cast(ewaddArgs.image0.address); - int8_t *inPtr1 = reinterpret_cast(ewaddArgs.image1.address); - int8_t *outPtr = reinterpret_cast(ewaddArgs.output.address); - int datasize = inputc * inputh * inputw; - float const0 = inScale0 / outScale; - float const1 = inScale1 / outScale; - fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); - fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); - for (int i = 0; i < datasize; i++) { - float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; - int tmpI = static_cast(round(tmpF)); - outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < 0 ? 0 : tmpI))); - } - fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); -} - -template <> -void ElementwiseAddReluKernel::Compute( - const ElementwiseAddReluParam ¶m) { - // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); - ComputeCPUEWAddRelu(param.FpgaArgs()); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp deleted file mode 100644 index d1138d06bb..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ELEMENTWISEMUL_OP - -#include "operators/kernel/elementwise_mul_kernel.h" -#include "operators/math/elementwise_op_function.h" - -namespace paddle_mobile { -namespace operators { - -template -struct MulFunctor { - inline T operator()(T a, T b) const { return a * b; } -}; -template <> -bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { - param->float_input_x.Resize(param->InputX()->dims()); - param->float_input_x.init(type_id().hash_code()); - fpga::format_fp32_ofm(&(param->float_input_x)); - - param->float_out.Resize(param->InputX()->dims()); - param->float_out.init(type_id().hash_code()); - fpga::format_fp32_ofm(&(param->float_out)); - - auto *out = param->Out(); - fpga::format_ofm(out); - return true; -} - -template <> -void ElementwiseMulKernel::Compute( - const ElementwiseMulParam ¶m) { - auto input_x = const_cast(param.InputX()); - auto intput_x_float = const_cast(&(param.float_input_x)); - // auto intput_x_32_ptr = - // const_cast(param.float_input_x.data()); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = input_x->data(); - args.image.channels = (uint32_t)(input_x->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = intput_x_float->data(); - args.output.scale_address = intput_x_float->scale; - fpga::PerformBypass(args); - fpga::fpga_invalidate(args.output.address, - input_x->fpga_data_num * sizeof(float)); - - auto input_y = param.InputY(); - int axis = param.Axis(); - auto out_float = const_cast(&(param.float_out)); - ElementwiseComputeEx, float>( - intput_x_float, input_y, axis, MulFunctor(), out_float); - fpga::fpga_flush(out_float->data(), - input_x->fpga_data_num * sizeof(float)); - - Tensor *Out = param.Out(); - 
args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = out_float->data(); - args.image.channels = (uint32_t)(Out->fpga_data_num); - args.image.height = 1; - args.image.width = 1; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = Out->data(); - args.output.scale_address = Out->scale; - fpga::PerformBypass(args); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/feed_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/feed_kernel.cpp deleted file mode 100644 index b797b3faf8..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/feed_kernel.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/kernel/feed_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FeedKernel::Init(FeedParam *param) { - auto output = param->Out(); - if (output->dims().size() != 4) { - output->init(type_id().hash_code()); - return true; - } - fpga::format_ofm(output); - return true; -} - -template <> -void FeedKernel::Compute(const FeedParam ¶m) { - auto output = param.Out(); - int col = param.Col(); - auto input = const_cast(¶m.InputX()->at(col)); - if (output->dims().size() != 4) { - size_t size = output->numel() * sizeof(float); - auto output_ptr = output->data(); - auto input_ptr = input->data(); - auto external_ptr = reinterpret_cast(input->external_data); - float *p_data = external_ptr == nullptr ? input_ptr : external_ptr; - memcpy(output_ptr, p_data, size); - input->external_data = nullptr; - return; - } - fpga::format_image(input); - - auto output_ptr = output->data(); - int channel = output->dims()[1]; - int height = output->dims()[2]; - int width = output->dims()[3]; - int size = fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height; - auto input_ptr = input->data(); - fpga::fpga_invalidate(input_ptr, size * sizeof(int8_t)); - memcpy(output_ptr, input_ptr, size * sizeof(int8_t)); - - fpga::fpga_flush(output_ptr, - fpga::align_to_x(channel * width, IMAGE_ALIGNMENT) * height * - sizeof(int8_t)); -} -template class FeedKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/fetch_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/fetch_kernel.cpp deleted file mode 100644 index c6b8f9e852..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/fetch_kernel.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "operators/kernel/fetch_kernel.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool FetchKernel::Init(FetchParam *param) { - auto input = const_cast(param->InputX()); - int col = param->Col(); - DLOG << "col = " << col; - auto output = &(param->Out()->at(col)); - output->init(type_id().hash_code()); - output->mutable_data(input->dims()); - - auto aligned_output = param->aligned_out; - int outC = 1; - int outW = 1; - if (output->dims().size() == 4) { - outC = output->dims()[1]; - outW = output->dims()[3]; - } else { // 2 - outC = output->dims()[1]; - } - int unalignedCW = outC * outW; - int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT); - if (alignedCW != unalignedCW) { - param->aligned_out = std::make_shared(); - param->aligned_out->Resize(input->dims()); - param->aligned_out->init(type_id().hash_code()); - fpga::format_ofm(param->aligned_out.get()); - } - return true; -} -void dealign(float *src, float *dst, int input_c, int input_h, int input_w) { - int alignCW = - paddle_mobile::fpga::align_to_x(input_c * input_w, IMAGE_ALIGNMENT); - int dealignCW = input_c * input_w; - for (int h = 0; h < input_h; ++h) { - auto input_offset = h * alignCW; - auto output_offset = h * dealignCW; - memcpy((dst + output_offset), (src + input_offset), - dealignCW * sizeof(float)); - } -} -template <> -void FetchKernel::Compute(const FetchParam ¶m) { - auto input = const_cast(param.InputX()); - int col = param.Col(); - auto output = ¶m.Out()->at(col); - auto outdata_ptr = const_cast(output->data()); - int outC = 1; - int outH = 1; - int outW = 
1; - if (output->dims().size() == 4) { - outC = output->dims()[1]; - outH = output->dims()[2]; - outW = output->dims()[3]; - } else { // 2 - outC = output->dims()[1]; - } - int unalignedCW = outC * outW; - int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT); - if (input->type() == type_id()) { - if ((output->dims().size() != 4) || (unalignedCW == alignedCW)) { - output->ShareDataWith(*input); - } else { - auto input_address = input->data(); - dealign(input_address, outdata_ptr, outC, outH, outW); - fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float)); - } - - return; - } - auto input_address = input->data(); - float Si = input->scale[0]; - - const int num_th = 32; - fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t)); - if (input->fpga_data_num < num_th) { - for (int idx = 0; idx < product(input->dims()); ++idx) { - outdata_ptr[idx] = input_address[idx] / 127.0 * Si; - } - fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float)); - return; - } - - auto aligned_out = param.aligned_out.get(); - if (unalignedCW != alignedCW) { - auto aligned_ptr = aligned_out->data(); - fpga::fpga_invalidate(aligned_ptr, (input->fpga_data_num) * sizeof(float)); - for (int idx = 0; idx < input->fpga_data_num; ++idx) { - aligned_ptr[idx] = input_address[idx] / 127.0 * Si; - } - dealign(aligned_ptr, outdata_ptr, outC, outH, outW); - fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float)); - return; - } - for (int idx = 0; idx < input->fpga_data_num; ++idx) { - outdata_ptr[idx] = input_address[idx] / 127.0 * Si; - } - fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float)); -} -template class FetchKernel; - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp deleted file mode 100644 index 4767b08e73..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp +++ 
/dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef FUSION_FC_OP - -#include "operators/kernel/fusion_fc_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcKernel::Init(FusionFcParam *param) { - bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - const Tensor *input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - float Si = input_x->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; - float So = out->scale[0]; - - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = Si / So * Sf / 127.0f; - bs_ptr[i] = input_z_ptr[i] * 127.0f / So; - } - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, 
height, width})); - float max_value = fpga::filter_find_max(filter); - fpga::format_fc_filter(filter, max_value); - - int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, - 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void FusionFcKernel::Compute(const FusionFcParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp deleted file mode 100644 index 9748327355..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef FUSION_FCRELU_OP - -#include "operators/kernel/fc_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcReluKernel::Init(FusionFcReluParam *param) { - bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::LEAKYRELU; - int16_t leaky_relu_negative_slope = 0; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - const Tensor *input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - float Si = input_x->scale[0]; - float Sf = fpga::filter_find_max(filter) / 127; - float So = out->scale[0]; - - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = Si / So * Sf / 127.0f; - bs_ptr[i] = input_z_ptr[i] * 127.0f / So; - } - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(filter); - fpga::format_fc_filter(filter, max_value); - - int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, - 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} -} // namespace 
operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/pool_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/pool_kernel.cpp deleted file mode 100644 index aafc86d888..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/pool_kernel.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef POOL_OP - -#include "operators/kernel/pool_kernel.h" - -class PoolingArgs; -namespace paddle_mobile { -namespace operators { - -template <> -bool PoolKernel::Init(PoolParam *param) { - auto *input = const_cast(param->Input()); - auto *output = param->Output(); - vector ksize = param->Ksize(); - vector strides = param->Strides(); - vector paddings = param->Paddings(); - std::string pooling_type = param->PoolingType(); - - if (input->type() == type_id()) { - int channels = input->dims()[1]; - int height = input->dims()[2]; - int width = input->dims()[3]; - int num = input->dims()[0]; - int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1; - int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1; - framework::DDim dim = - framework::make_ddim({num, channels, out_height, out_width}); - output->mutable_data(dim); - return true; - } - - auto input_ptr = input->data(); - fpga::format_ofm(output); - auto output_ptr = output->mutable_data(); - float Si = input->scale[0]; - float So = output->scale[0]; - - fpga::PoolingArgs poolArgs = 
{0}; - poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1 - poolArgs.kernel_reciprocal = fpga::fp32_2_fp16( - float(1.0 / (ksize[0] * ksize[1]) * Si / So)); // NOLINT - poolArgs.image.address = input_ptr; - poolArgs.image.channels = (uint32_t)input->dims()[1]; - poolArgs.image.height = (uint32_t)input->dims()[2]; - poolArgs.image.width = (uint32_t)input->dims()[3]; - poolArgs.image.pad_height = (uint32_t)paddings[0]; - poolArgs.image.pad_width = (uint32_t)paddings[1]; - poolArgs.image.scale_address = input->scale; - poolArgs.output.address = output_ptr; - poolArgs.output.scale_address = output->scale; - poolArgs.kernel.height = (uint32_t)ksize[0]; - poolArgs.kernel.width = (uint32_t)ksize[1]; - poolArgs.kernel.stride_h = (uint32_t)strides[0]; - poolArgs.kernel.stride_w = (uint32_t)strides[1]; - param->SetFpgaArgs(poolArgs); - return true; -} - -template <> -void PoolKernel::Compute(const PoolParam ¶m) { - auto *input = const_cast(param.Input()); - - if (input->type() == type_id()) { - auto *output = param.Output(); - auto in = input->data(); - auto N = input->dims()[0]; - output->Resize( - {N, output->dims()[1], output->dims()[2], output->dims()[3]}); - auto len = output->numel(); - auto out = output->mutable_data(); - int C = input->dims()[1], H = input->dims()[2], // N = input->dims()[0], - W = input->dims()[3]; - int HW = H * W, CHW = C * H * W, WC = W * C; - - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - out[n * C + c] = 0; - for (int h = 0; h < H; h++) { - for (int w = 0; w < W; w++) { - out[n * C + c] += in[n * CHW + h * WC + w * C + - c]; // in[n * CHW + c * HW + h * W + w]; // - } - } - out[n * C + c] /= HW; - } - } - return; - } - fpga::ComputeFpgaPool(param.FpgaArgs()); -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp deleted file mode 100644 index c2f8b55c1e..0000000000 --- 
a/mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp +++ /dev/null @@ -1,452 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PROPOSAL_OP - -#include -#include -#include -#include "operators/kernel/detection_kernel.h" - -namespace paddle_mobile { -namespace operators { - -static const double kBBoxClipDefault = std::log(1000.0 / 16.0); - -template <> -bool ProposalKernel::Init(ProposalParam *param) { - int post_nms_top_n = param->post_nms_topn_; - int64_t batch = param->scores_->dims()[0]; - auto total = post_nms_top_n * batch; - param->rpn_rois_->mutable_data({total, 4}); - param->rpn_probs_->mutable_data({total, 1}); - - param->float_bbox = std::make_shared(); - param->float_bbox->Resize(param->bbox_deltas_->dims()); - param->float_bbox->init(type_id().hash_code()); - fpga::format_fp32_ofm(param->float_bbox.get()); - - auto input = param->scores_; - param->score_index_ = std::make_shared(); - param->score_index_->mutable_data({input->numel()}); - auto score_index = param->score_index_->data(); - for (int i = 0; i < input->numel(); ++i) { - score_index[i] = i; - } - - return true; -} -template -void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) { - PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1), - "Dim not correct"); - int64_t index_size = index.dims()[0]; - - auto src_dims = src.dims(); - - const T *p_src = src.data(); - 
const int *p_index = index.data(); - T *p_output = output->data(); - - // slice size - int slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; - - const size_t slice_bytes = slice_size * sizeof(T); - - for (int64_t i = 0; i < index_size; ++i) { - int index_ = p_index[i]; - memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); - } -} - -void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { - auto *out_data = dst->data(); - auto *to_add_data = src.data(); - size_t size_of_t = framework::SizeOfType(src.type()); - offset *= size_of_t; - std::memcpy( - reinterpret_cast(reinterpret_cast(out_data) + offset), - to_add_data, src.numel() * size_of_t); -} - -template -static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, - Tensor *proposals) { - T *proposals_data = proposals->mutable_data(); - - int64_t row = all_anchors->dims()[0]; - int64_t len = all_anchors->dims()[1]; - - auto *bbox_deltas_data = bbox_deltas->data(); - auto *anchor_data = all_anchors->data(); - - for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - - T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; - T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; - - T bbox_center_x = 0, bbox_center_y = 0; - T bbox_width = 0, bbox_height = 0; - - bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; - bbox_center_y = - bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; - bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; - - proposals_data[i * len] = bbox_center_x - bbox_width / 2; - proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; - proposals_data[i * len + 3] 
= bbox_center_y + bbox_height / 2; - } -} - -template -static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) { - T *boxes_data = boxes->mutable_data(); - const T *im_info_data = im_info.data(); - T zero(0); - for (int64_t i = 0; i < boxes->numel(); ++i) { - if (i % 4 == 0) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else if (i % 4 == 1) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } else if (i % 4 == 2) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } - } -} - -template -static inline void FilterBoxes(Tensor *boxes, float min_size, - const Tensor &im_info, Tensor *keep) { - const T *im_info_data = im_info.data(); - T *boxes_data = boxes->mutable_data(); - T im_scale = im_info_data[2]; - keep->Resize({boxes->dims()[0]}); - min_size = std::max(min_size, 1.0f); - int *keep_data = keep->mutable_data(); - - int keep_len = 0; - for (int i = 0; i < boxes->dims()[0]; ++i) { - T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; - T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; - T ws_origin_scale = - (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; - T hs_origin_scale = - (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; - T x_ctr = boxes_data[4 * i] + ws / 2; - T y_ctr = boxes_data[4 * i + 1] + hs / 2; - if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && - x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { - keep_data[keep_len++] = i; - } - } - keep->Resize({keep_len}); -} - -template -static inline std::vector> GetSortedScoreIndex( - const std::vector &scores) { - std::vector> sorted_indices; - sorted_indices.reserve(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - sorted_indices.emplace_back(scores[i], i); - } - // Sort the score pair according to the scores in 
descending order - std::stable_sort(sorted_indices.begin(), sorted_indices.end(), - [](const std::pair &a, const std::pair &b) { - return a.first < b.first; - }); - return sorted_indices; -} - -template -static inline T BBoxArea(const T *box, bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); - } - } -} - -template -static inline Tensor VectorToTensor(const std::vector &selected_indices, - int selected_num) { - Tensor keep_nms; - keep_nms.Resize({selected_num}); - auto *keep_data = keep_nms.mutable_data(); - for (int i = 0; i < selected_num; ++i) { - keep_data[i] = selected_indices[i]; - } - return keep_nms; -} - -template -static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); - const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold, - float eta, int post_nms_num = 100) { - int64_t num_boxes = bbox->dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox->dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores->data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices = - 
GetSortedScoreIndex(scores_data); - - std::vector selected_indices; - int selected_num = 0; - T adaptive_threshold = nms_threshold; - const T *bbox_data = bbox->data(); - while ((sorted_indices.size() != 0) && (selected_num < post_nms_num)) { - int idx = sorted_indices.back().second; - bool flag = true; - for (int kept_idx : selected_indices) { - if (flag) { - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, false); - flag = (overlap <= adaptive_threshold); - } else { - break; - } - } - if (flag) { - selected_indices.push_back(idx); - ++selected_num; - } - sorted_indices.erase(sorted_indices.end() - 1); - if (flag && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } - return VectorToTensor(selected_indices, selected_num); -} - -template -std::pair ProposalForOneImage( - const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, - const Tensor &bbox_deltas_slice, // [M, 4] - const Tensor &scores_slice, // [N, 1] - const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n, - float nms_thresh, float min_size, float eta) { - auto *scores_data = scores_slice.data(); - // Sort index - Tensor index_t; - index_t.Resize({scores_slice.numel()}); - int *index = index_t.mutable_data(); - std::memcpy(index, score_index.data(), - scores_slice.numel() * sizeof(int)); - - auto compare = [scores_data](const int64_t &i, const int64_t &j) { - return scores_data[i] > scores_data[j]; - }; - - if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { - std::sort(index, index + scores_slice.numel(), compare); - } else { - std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(), - compare); - index_t.Resize({pre_nms_top_n}); - } - - Tensor scores_sel, bbox_sel, anchor_sel, var_sel; - scores_sel.mutable_data({index_t.numel(), 1}); - bbox_sel.mutable_data({index_t.numel(), 4}); - anchor_sel.mutable_data({index_t.numel(), 4}); - var_sel.mutable_data({index_t.numel(), 4}); 
- - CPUGather(scores_slice, index_t, &scores_sel); - CPUGather(bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(anchors, index_t, &anchor_sel); - Tensor proposals; - proposals.mutable_data({index_t.numel(), 4}); - BoxCoder(&anchor_sel, &bbox_sel, &proposals); - - ClipTiledBoxes(im_info_slice, &proposals); - - Tensor keep; - FilterBoxes(&proposals, min_size, im_info_slice, &keep); - - Tensor scores_filter; - bbox_sel.mutable_data({keep.numel(), 4}); - scores_filter.mutable_data({keep.numel(), 1}); - - CPUGather(proposals, keep, &bbox_sel); - CPUGather(scores_sel, keep, &scores_filter); - if (nms_thresh <= 0) { - return std::make_pair(bbox_sel, scores_filter); - } - - Tensor keep_nms = - NMS(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n); - - if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { - keep_nms.Resize({post_nms_top_n}); - } - - proposals.mutable_data({keep_nms.numel(), 4}); // original - scores_sel.mutable_data({keep_nms.numel(), 1}); // original - - CPUGather(bbox_sel, keep_nms, &proposals); - CPUGather(scores_filter, keep_nms, &scores_sel); - return std::make_pair(proposals, scores_sel); -} - -template <> -void ProposalKernel::Compute(const ProposalParam ¶m) { - auto input_score = param.scores_; - auto input_score_data = input_score->data(); - uint32_t score_n, score_height, score_width, score_channels; - - auto input_bbox = param.bbox_deltas_; - auto input_bbox_data = input_bbox->data(); - uint32_t bbox_n, bbox_height, bbox_width, bbox_channels; - - score_n = (uint32_t)(input_score->dims()[0]); - score_channels = (uint32_t)(input_score->dims()[1]); - score_height = (uint32_t)(input_score->dims()[2]); - score_width = (uint32_t)(input_score->dims()[3]); - - bbox_n = (uint32_t)(input_bbox->dims()[0]); - bbox_channels = (uint32_t)(input_bbox->dims()[1]); - bbox_height = (uint32_t)(input_bbox->dims()[2]); - bbox_width = (uint32_t)(input_bbox->dims()[3]); - - int64_t amount_per_side = score_width * score_height; - - int alignedCW = - 
fpga::align_to_x(score_width * score_channels, IMAGE_ALIGNMENT); - int unalignedCW = score_width * score_channels; - fpga::fpga_invalidate(input_score_data, - score_height * alignedCW * sizeof(int8_t)); - - Tensor score_tensor = *input_score; - for (int h = 0; h < score_height; h++) { - for (int w = 0; w < score_width; w++) { - for (int c = 0; c < score_channels; ++c) { - int dstidx = h * unalignedCW + w * score_channels + c; - int srcidx = h * alignedCW + w * score_channels + c; - score_tensor.data()[dstidx] = input_score_data[srcidx]; - } - } - } - - amount_per_side = bbox_width * bbox_height; - alignedCW = fpga::align_to_x(bbox_width * bbox_channels, IMAGE_ALIGNMENT); - unalignedCW = bbox_width * bbox_channels; - fpga::fpga_invalidate(input_bbox_data, - bbox_height * alignedCW * sizeof(int8_t)); - - auto bbox_tensor = param.float_bbox.get(); - for (int h = 0; h < bbox_height; h++) { - for (int w = 0; w < bbox_width; w++) { - for (int c = 0; c < bbox_channels; ++c) { - int dstidx = h * unalignedCW + w * bbox_channels + c; - int srcidx = h * alignedCW + w * bbox_channels + c; - bbox_tensor->data()[dstidx] = - (static_cast(input_bbox_data[srcidx])) / 127.0 * - input_bbox->scale[0]; - } - } - } - auto *im_info = param.im_info_; - auto anchors = *param.anchors_; - auto variances = *param.variances_; - - auto *rpn_rois = param.rpn_rois_; - auto *rpn_roi_probs = param.rpn_probs_; - - auto score_index = *(param.score_index_.get()); - - int pre_nms_top_n = param.pre_nms_topn_; - int post_nms_top_n = param.post_nms_topn_; - - float nms_thresh = param.nms_thresh_ / 2.0f; - float min_size = param.min_size_; - float eta = param.eta_; - - rpn_rois->mutable_data({bbox_tensor->numel() / 4, 4}); - rpn_roi_probs->mutable_data({input_score->numel() / 4, 1}); - framework::LoD lod; - lod.resize(1); - auto &lod0 = lod[0]; - lod0.push_back(0); - anchors.Resize({anchors.numel() / 4, 4}); - variances.Resize({variances.numel() / 4, 4}); - - int64_t num_proposals = 0; - for (int64_t i = 
0; i < score_n; ++i) { - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1); - Tensor scores_slice = score_tensor.Slice(i, i + 1); - - bbox_deltas_slice.Resize({bbox_height * bbox_width * bbox_channels / 4, 4}); - scores_slice.Resize({score_height * score_width * score_channels, 1}); - std::pair tensor_pair = ProposalForOneImage( - im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, - score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); - Tensor &proposals = tensor_pair.first; - Tensor &scores = tensor_pair.second; - - AppendProposals(rpn_rois, 4 * num_proposals, proposals); - AppendProposals(rpn_roi_probs, num_proposals, scores); - num_proposals += proposals.dims()[0]; - lod0.push_back(num_proposals); - } - rpn_rois->set_lod(lod); - rpn_roi_probs->set_lod(lod); - rpn_rois->Resize({num_proposals, 4}); - rpn_roi_probs->Resize({num_proposals, 1}); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PROPOSAL_OP diff --git a/mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp deleted file mode 100644 index 00c0b5d631..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PSROI_POOL_OP - -#include -#include -#include "operators/kernel/detection_kernel.h" - -#include "fpga/V2/api.h" -#include "fpga/V2/image.h" -namespace paddle_mobile { -namespace operators { - -template <> -bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { - auto dims = param->input_x_->dims(); - PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, - "data not aligned"); - - param->float_input = std::make_shared(); - param->float_input->mutable_data(param->input_x_->dims()); - - auto* rois = param->input_rois_; - int rois_num = rois->dims()[0]; - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, param->output_->dims()[1], param->output_->dims()[2], - param->output_->dims()[3]}); - param->output_->Resize(dims_out_new); - - param->output_->mutable_data(dims_out_new); - return true; -} - -template -void PSROIPoolingForward(const int8_t* bottom_data, const int height, - const int width, const int input_channel, - Dtype* top_data, const int pooled_height, - const int pooled_width, const int output_channel, - const Dtype* bottom_rois, const Dtype Bin_size_h, - const Dtype Bin_size_w, const Dtype roi_start_h, - const Dtype roi_start_w, const int pw, const int ph, - float scale, const int roi_batch_ind) { - int hstart = floor(static_cast(ph) * Bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * Bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * Bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * Bin_size_w + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - float avg_pixels_c[output_channel] = {0}; - int sum_pixels_c[output_channel] = {0}; - int8_t pixels_c[output_channel] = {0}; - if (!is_empty) { - Dtype bin_area = (hend - 
hstart) * (wend - wstart); - float scale_fuse = scale / bin_area; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int pixel_offset = (h * width + w) * input_channel; - for (int output_c = 0; output_c < output_channel; output_c++) { - int input_channel_offset = output_c * pooled_height * pooled_width; - int input_bias = - pixel_offset + input_channel_offset + ph * pooled_width + pw; - pixels_c[output_c] = bottom_data[input_bias]; - } - - for (int output_c = 0; output_c < output_channel; output_c++) { - sum_pixels_c[output_c] += pixels_c[output_c]; - } - } - } - for (int output_c = 0; output_c < output_channel; output_c++) { - avg_pixels_c[output_c] = sum_pixels_c[output_c] * scale_fuse; - } - } - - int output_index_base = (ph * pooled_width + pw) * output_channel; - top_data += output_index_base; - memcpy(top_data, avg_pixels_c, output_channel * 4); -} - -template <> -void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { - auto input_tensor = param.input_x_; - auto input_data = input_tensor->data(); - auto scale = input_tensor->scale[0] / 127.0; - fpga::fpga_invalidate(input_data, input_tensor->numel() * sizeof(int8_t)); - auto* rois = param.input_rois_; - auto* out = param.output_; - - auto pooled_height = param.pooled_height_; - auto pooled_width = param.pooled_width_; - auto spatial_scale = param.spatial_scale_; - auto output_channels = param.output_channels_; - - auto in_dims = input_tensor->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), - (param.output_)->dims()[3]}); - - (param.output_)->Resize(dims_out_new); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - auto rois_batch_id_data = rois_batch_id_list.mutable_data(); - - 
PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_MOBILE_ENFORCE( - rois_batch_size == batch_size, - "the rois_batch_size and input(X) batch_size should be the same."); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num, - "the rois_num from input and lod must be the same"); - - PADDLE_MOBILE_ENFORCE( - input_channels == output_channels * pooled_height * pooled_width, - "the channels of input X should equal the product of " - "output_channels x pooled_height x pooled_width"); - - auto output_data = out->mutable_data(); - auto input_rois = rois->data(); - - for (int n = 0; n < rois_num; ++n) { - auto offset_input_rois = input_rois + n * 4; - auto offset_output_data = - output_data + pooled_height * pooled_width * output_channels * n; - - auto roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - auto roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - auto roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - auto roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - - // Force too small rois to be 1 x 1 - auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 - auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); - - // Compute bin size w and h at input feature map - auto bin_size_h = roi_height / static_cast(pooled_height); - auto bin_size_w = roi_width / static_cast(pooled_width); - - int roi_batch_ind = rois_batch_id_data[n]; - - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - PSROIPoolingForward(input_data, height, width, input_channels, - offset_output_data, pooled_height, - pooled_width, output_channels, input_rois, - bin_size_h, bin_size_w, roi_start_h, - roi_start_w, pw, ph, scale, roi_batch_ind); - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // PSROI_POOL_OP diff --git a/mobile/src/operators/kernel/fpga/V2/relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/relu_kernel.cpp deleted file mode 100644 index 6fff10f620..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/relu_kernel.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RELU_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReluKernel::Init(ReluParam *param) { - param->Out()->ShareDataWith(*param->InputX()); - return true; -} - -template <> -void ReluKernel::Compute(const ReluParam ¶m) {} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp deleted file mode 100644 index 5b651ad6e6..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE2_OP - -#include "operators/kernel/reshape2_kernel.h" -#include "framework/ddim.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Reshape2Kernel::Init(Reshape2Param *param) { - auto input = const_cast(param->InputX()); - auto output = param->Out(); - auto shape = param->Shape(); - output->scale[0] = input->scale[0]; - - auto num_in = framework::product(input->dims()); - auto num_shape = framework::product(framework::make_ddim(shape)); - PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); - - for (int i = 0; i < shape.size(); i++) { - if (shape[i] == -1) { - shape[i] = static_cast(-num_in / num_shape); - break; - } - } - output->Resize(framework::make_ddim(shape)); - output->set_type(input->type()); - fpga::format_ofm(output); - DLOG << "input: " << input; - DLOG << "output: " << output; - - return true; -} - -void reshape(LoDTensor *input, LoDTensor *output) { - // Subscript r means after reshape - - auto input_ptr = input->data(); - auto output_ptr = output->data(); - output->scale[0] = input->scale[0]; - output->scale[1] = input->scale[1]; - - auto C = static_cast(input->dims()[1]); - auto H = static_cast(input->dims()[2]); - auto W = static_cast(input->dims()[3]); - auto Cr = static_cast(output->dims()[1]); - auto Hr = static_cast(output->dims()[2]); - auto Wr = static_cast(output->dims()[3]); - PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match"); - auto WC = W * C; - auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT); - auto HW = H * W; - auto WCr = Wr * Cr; - auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT); - auto HWr = Hr * Wr; - - fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(int8_t)); - - int offset_align = 0; - int offset_r = 0, offset_align_r = 0; - int cr = 0, hr = 0, wr = 0; - - for (int h = 0; h < H; h++) { - int offset0 = h * WC_align; - for (int w = 0; w < W; w++) { - int offset1 = w * C + offset0; - for (int c = 0; c < C; c++) { - offset_align = offset1 
+ c; - offset_r = c * HW + h * W + w; - cr = offset_r / HWr; - hr = offset_r % HWr / Wr; - wr = offset_r % Wr; - offset_align_r = hr * WCr_align + wr * Cr + cr; - output_ptr[offset_align_r] = input_ptr[offset_align]; - } - } - } - - fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(int8_t)); -} - -template <> -void Reshape2Kernel::Compute(const Reshape2Param ¶m) { - auto input = const_cast(param.InputX()); - auto output = param.Out(); - auto shape = param.Shape(); - - auto num_in = framework::product(input->dims()); - auto num_shape = framework::product(framework::make_ddim(shape)); - PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); - - for (int i = 0; i < shape.size(); i++) { - if (shape[i] == -1) { - shape[i] = static_cast(-num_in / num_shape); - break; - } - } - output->Resize(framework::make_ddim(shape)); - - bool reshapeNeedFlg = 1; - if (output->dims() == input->dims()) { - reshapeNeedFlg = 0; - } else if (output->dims().size() != input->dims().size()) { - auto inputdimsize = input->dims().size(); - auto outputdimsize = output->dims().size(); - int smallersize = - inputdimsize > outputdimsize ? outputdimsize : inputdimsize; - int i = 0; - for (i = 0; i < smallersize; i++) { - if ((input->dims())[i] != (output->dims())[i]) break; - } - if (i == smallersize) { - reshapeNeedFlg = 0; - } - } - if (reshapeNeedFlg) { - reshape(input, output); - } else { - DLOG << "No need to reshape"; - output->ShareDataWith(*input); - framework::LoD lod = input->lod(); - output->set_lod(lod); - output->scale[0] = input->scale[0]; - return; - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/reshape_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape_kernel.cpp deleted file mode 100644 index 5e01bb74ba..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/reshape_kernel.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef RESHAPE_OP - -#include "operators/kernel/reshape_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReshapeKernel::Init(ReshapeParam *param) { - param->Out()->ShareDataWith(*param->InputX()); - const int in_n = param->InputX()->dims()[0]; - const int in_c = param->InputX()->dims()[1]; - const int in_h = param->InputX()->dims()[2]; - const int in_w = param->InputX()->dims()[3]; - auto out = param->Out(); - out->Resize(framework::make_ddim({in_n, in_c * in_h * in_w})); - return true; -} - -template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) {} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp deleted file mode 100644 index 985f0fc94c..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ROIALIGN_POOL_OP - -#include -#include -#include "operators/kernel/detection_kernel.h" - -#include "fpga/V2/api.h" -#include "fpga/V2/image.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool RoiAlignPoolKernel::Init(RoiAlignPoolParam* param) { - auto dims = param->input_x_->dims(); - PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, - "data not aligned"); - - param->float_input = std::make_shared(); - param->float_input->mutable_data(param->input_x_->dims()); - - auto input = param->input_x_; - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_HWC; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->float_input->mutable_data(); - args.output.scale_address = param->float_input->scale; - param->input_arg = args; - - auto* rois = param->input_rois_; - int rois_num = rois->dims()[0]; - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, param->output_->dims()[1], param->output_->dims()[2], - param->output_->dims()[3]}); - param->output_->Resize(dims_out_new); - - param->output_->mutable_data(dims_out_new); - - return true; -} - -template -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - T w1; - T w2; - T w3; - T w4; -}; - -template -void pre_calc_for_bilinear_interpolate( - const int height, const int width, const int pooled_height, - const int pooled_width, const int iy_upper, const int ix_upper, - T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, - int roi_bin_grid_h, int roi_bin_grid_w, - std::vector>& pre_calc) { // NOLINT - int 
pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { - const T yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { - const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T x = xx; - T y = yy; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y <= 0) { - y = 0; - } - if (x <= 0) { - x = 0; - } - - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indeces - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; - } - } - } - } -} - -template -void ROIAlignForward(const int nthreads, const T* bottom_data, - const T& spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, - const T* bottom_rois, T* top_data) { - int n_rois = nthreads / channels / pooled_width / pooled_height; - - for (int n = 0; n < n_rois; n++) { - int index_n = n * channels * pooled_width * pooled_height; - - // roi could have 4 or 5 columns - const T* offset_bottom_rois = bottom_rois + n * 4; - int roi_batch_ind = 0; - // if (roi_cols == 5) { - // roi_batch_ind = offset_bottom_rois[0]; - // offset_bottom_rois++; - // } - - // Do not using rounding; this implementation detail is critical - T roi_start_w = offset_bottom_rois[0] * spatial_scale; - T roi_start_h = offset_bottom_rois[1] * spatial_scale; - T roi_end_w = offset_bottom_rois[2] * spatial_scale; - T roi_end_h = offset_bottom_rois[3] * spatial_scale; - // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); - // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); - // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); - // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); - T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = 
(sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 - - // we want to precalculate indeces and weights shared by all chanels, - // this is the key point of optimiation - std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * - pooled_width * pooled_height); - pre_calc_for_bilinear_interpolate( - height, width, pooled_height, pooled_width, roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, - roi_bin_grid_h, roi_bin_grid_w, pre_calc); - - for (int c = 0; c < channels; c++) { - int index_n_c = index_n + c * pooled_width * pooled_height; - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - int pre_calc_index = 0; - - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - int index = index_n_c + ph * pooled_width + pw; - - T output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_bottom_data[pc.pos1] + - pc.w2 * offset_bottom_data[pc.pos2] + - pc.w3 * offset_bottom_data[pc.pos3] + - pc.w4 * offset_bottom_data[pc.pos4]; - - pre_calc_index += 1; - } - } - output_val /= count; - - top_data[index] = output_val; - } // for pw - } // for ph - } // for c - } // for n -} - -template <> -void RoiAlignPoolKernel::Compute( - const RoiAlignPoolParam& param) { - auto input_tensor = param.float_input.get(); - fpga::PerformBypass(param.input_arg); - fpga::fpga_invalidate(input_tensor->data(), - input_tensor->numel() * sizeof(float)); - - auto* in = input_tensor; - auto* rois = param.input_rois_; - auto* out = param.output_; // param.float_output.get(); - - auto pooled_height = 
param.pooled_height_; - auto pooled_width = param.pooled_width_; - auto spatial_scale = param.spatial_scale_; - auto sampe_ratio = param.sampling_ratio_; - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto data_nhwc = in->mutable_data(); - - fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); - framework::DDim dims_out_new = framework::make_ddim( - {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), - (param.output_)->dims()[3]}); - (param.output_)->Resize(dims_out_new); - - const int index = input_channels * pooled_height * pooled_width * rois_num; - auto rois_data = rois->data(); - auto top_data = param.output_->mutable_data(); - for (int i = 0; i < index; ++i) { - ROIAlignForward(index, data_nhwc, spatial_scale, input_channels, - height, width, pooled_height, pooled_width, - sampe_ratio, rois_data, top_data); - } - - fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height, - pooled_width, rois_num); - out->reset_data_ptr(top_data); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // ROIALIGN_POOL_OP diff --git a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp deleted file mode 100644 index 44aae4be32..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SIGMOID_OP - -#include "operators/kernel/activation_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SigmoidKernel::Init(SigmoidParam *param) { - auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::SIGMOID; - int16_t leaky_relu_negative_slope = - fpga::fp32_2_fp16(input->scale[0] / 127.0); - auto out = param->Out(); - fpga::format_ofm(out); - - fpga::BypassArgs args = {fpga::DATA_TYPE_INT8}; - args.input_data_type = fpga::DATA_TYPE_INT8; - args.output_data_type = fpga::DATA_TYPE_INT8; - args.image.address = input_ptr; - args.image.height = 1; - args.image.width = 1; - args.image.channels = input->fpga_data_num; - args.output.address = out->data(); - args.output.scale_address = out->scale; - args.output.activation.activation_type = activation_enable; - args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; - param->SetFpgaArgs(args); - return true; -} - -template <> -void SigmoidKernel::Compute(const SigmoidParam ¶m) { - fpga::PerformBypass(param.FpgaArgs()); - param.Out()->scale[0] = 1.0; -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp deleted file mode 100644 index e40242d5c2..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SLICE_OP - -#include "operators/kernel/slice_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SliceKernel::Init(SliceParam* param) { - auto output = param->output_; - fpga::format_ofm(output); - DLOG << "input: " << param->input_; - DLOG << "output: " << param->output_; - if (param->input_->type() != type_id()) { - DLOG << "wrong type"; - } - return true; -} - -template <> -void SliceKernel::Compute(const SliceParam& param) { - // Only support slicing in channel dimension - // Only support half data - // W must be aligned to 16 - - auto input = param.input_; - auto output = param.output_; - int H = input->dims()[2]; - int W = input->dims()[3]; - int HW = input->dims()[2] * input->dims()[3]; - int channel = input->dims()[1]; - auto input_ptr = input->data(); - auto output_ptr = output->data(); - - output->scale[0] = input->scale[0]; - output->scale[1] = input->scale[1]; - - int start = param.starts_[0], end = param.ends_[0]; - start = start < 0 ? start + channel : start; - end = end < 0 ? end + channel : end; - start = start > channel ? channel : start; - end = end > channel ? 
channel : end; - int len = end - start; - size_t size = len * sizeof(int8_t); - DLOG << input->fpga_data_num; - fpga::fpga_invalidate(input_ptr, input->fpga_data_num * sizeof(int8_t)); - DLOG << output->fpga_data_num; - fpga::fpga_invalidate(output_ptr, output->fpga_data_num * sizeof(int8_t)); - int unalignedWC = len * W; - int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); - - if (unalignedWC != alignedWC) { - auto tmpOutput = - reinterpret_cast(fpga::fpga_malloc(len * HW * sizeof(int8_t))); - for (int i = 0; i < HW; i++) { - memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); - } - for (int i = 0; i < H; i++) { - for (int j = 0; j < unalignedWC; j++) { - *(output_ptr + alignedWC * i + j) = *(tmpOutput + unalignedWC * i + j); - } - } - fpga::fpga_free(tmpOutput); - } else { - for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); - } - } - fpga::fpga_flush(output_ptr, output->fpga_data_num * sizeof(int8_t)); -} -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp deleted file mode 100755 index 843f249c68..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SOFTMAX_OP - -#include "operators/kernel/softmax_kernel.h" -#include "operators/kernel/central-arm-func/softmax_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool SoftmaxKernel::Init(SoftmaxParam *param) { - auto input = const_cast(param->InputX()); - auto dims = framework::vectorize(input->dims()); - - auto out = param->Out(); - out->Resize(framework::make_ddim(dims)); - - int input_c = 1, input_h = 1, input_w = 1; - if (dims.size() == 4) { - input_h = dims[1]; - input_w = dims[2]; - input_c = dims[3]; - if (input_c == 1) { // This input is generated by FC op, dims = [N C 1 1] - PADDLE_MOBILE_ENFORCE(input_w == 1, "Softmax input must come from FC op"); - input_c = dims[1]; - input_h = 1; - } - } else if (dims.size() == 2) { - input_c = dims[1]; - } - - input->Resize(framework::make_ddim(dims)); - if ((input_c == 2) && (input->type() == type_id())) { - auto input_ptr = input->data(); - float Si = input->scale[0]; - int16_t slope = fpga::fp32_2_fp16(Si / 127); - out->mutable_data(framework::make_ddim(dims)); - fpga::format_ofm(out); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = input_ptr; - args.image.height = input_h; - args.image.width = input_w; - args.image.channels = input_c; - args.output.address = out->data(); - args.output.scale_address = out->scale; - args.output.activation.activation_type = fpga::SOFTMAX; - args.output.activation.leaky_relu_negative_slope = slope; - param->SetFpgaArgs(args); - } else { - out->mutable_data(framework::make_ddim(dims)); - fpga::format_ofm(out); - } - - return true; -} - -template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - auto *in_x = (param.InputX()); - auto dims = in_x->dims(); - - auto n = 1; - auto h = 1; - auto w = 1; - auto c = 1; - if (dims.size() 
== 4) { - n = dims[0]; - h = dims[1]; - w = dims[2]; - c = dims[3]; - if (c == 1) { // This input is generated by FC op, dims = [N C 1 1] - PADDLE_MOBILE_ENFORCE(w == 1, "Softmax input must come from FC op"); - c = dims[1]; - h = 1; - } - } else if (dims.size() == 2) { - n = dims[0]; - c = dims[1]; - } - if ((c == 2) && (in_x->type() == type_id())) { - fpga::PerformBypass(param.FpgaArgs()); - } else if (in_x->type() == type_id()) { - auto in_data = in_x->data(); - float Si = in_x->scale[0]; - Tensor *out = param.Out(); - out->Resize({n, h, w, c}); - auto float_input_x = param.float_input_x_; - float_input_x = std::make_shared(); - float_input_x->Resize(in_x->dims()); - float_input_x->init(type_id().hash_code()); - fpga::format_fp32_ofm(float_input_x.get()); - auto float_input_x_data = float_input_x->data(); - int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT); - for (int i = 0; i < dataNum; i++) { - float_input_x_data[i] = in_data[i] * Si / 127; - } - math::SoftmaxFuntor()(float_input_x.get(), out); - } else { - Tensor *out = param.Out(); - out->Resize({n, h, w, c}); - math::SoftmaxFuntor()(in_x, out); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/split_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/split_kernel.cpp deleted file mode 100644 index af3fe9df00..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/split_kernel.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SPLIT_OP - -#include "operators/kernel/split_kernel.h" - -namespace paddle_mobile { -namespace operators { -template <> -bool SplitKernel::Init(SplitParam *param) { - auto *in = const_cast(param->InputX()); - auto outs = param->Outs(); - auto sections = param->Sections(); - int axis = param->Axis(); - PADDLE_MOBILE_ENFORCE(axis == 1, "Only support split in channel dimension"); - PADDLE_MOBILE_ENFORCE(outs.size() == sections.size(), - "Output number should be equal to section number"); - auto image_num = (uint32_t)outs.size(); - auto images_out = - reinterpret_cast(fpga::fpga_malloc(image_num * sizeof(void *))); - auto scales_out = reinterpret_cast( - fpga::fpga_malloc(image_num * sizeof(float *))); - auto out_channels = reinterpret_cast( - fpga::fpga_malloc(image_num * sizeof(uint32_t))); - DLOG << "input: " << in; - for (int i = 0; i < image_num; i++) { - fpga::format_ofm(outs[i]); - DLOG << "output: " << outs[i]; - images_out[i] = outs[i]->mutable_data(); - scales_out[i] = outs[i]->scale; - out_channels[i] = (uint32_t)sections[i]; - } - - auto deleter = [](void *p) { fpga::fpga_free(p); }; - - fpga::SplitArgs arg = {0}; - arg.image_num = image_num; - arg.image_in = in->data(); - arg.scale_in = in->scale; - arg.images_out = images_out; - arg.scales_out = scales_out; - arg.out_channel_nums = out_channels; - arg.height = (uint32_t)in->dims()[2]; - arg.width = (uint32_t)in->dims()[3]; - arg.vector_split_space.push_back( - std::shared_ptr(reinterpret_cast(images_out), deleter)); - arg.vector_split_space.push_back( - std::shared_ptr(reinterpret_cast(scales_out), deleter)); - arg.vector_split_space.push_back( - std::shared_ptr(reinterpret_cast(out_channels), deleter)); - - param->SetFpgaArgs(arg); - return true; -} -template <> -void SplitKernel::Compute(const SplitParam ¶m) { - fpga::ComputeFPGASplit(param.FpgaArgs()); -} - -} // namespace operators -} // 
namespace paddle_mobile -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/tanh_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/tanh_kernel.cpp deleted file mode 100644 index 670689e083..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/tanh_kernel.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TANH_OP - -#include "operators/kernel/tanh_kernel.h" -#include -namespace paddle_mobile { -namespace operators { - -template <> -bool TanhKernel::Init(TanhParam *param) { - auto input = const_cast(param->InputX()); - DLOG << "input: " << input; - auto input_ptr = input->data(); - auto float_input = new LoDTensor; - - float_input->mutable_data( - {1, input->dims()[1], input->dims()[2], input->dims()[3]}); - fpga::format_fp32_ofm(float_input); - - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input_ptr; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = float_input->data(); - args.output.scale_address = float_input->scale; - param->SetFloatInput(float_input); - param->SetFpgaArgs(args); - return true; -} - -#define EXP_MAX_INPUT 40.0 -template -T 
Tanh(const T a) { - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} -template -void tanhFuntor(Tensor *input, Tensor *output) { - auto *input_ptr = input->data(); - auto *output_ptr = output->mutable_data(); - for (int i = 0; i < input->numel(); i++) { - *(output_ptr + i) = Tanh(*(input_ptr + i)); - } -} -template <> -void TanhKernel::Compute(const TanhParam ¶m) { - Tensor *in_x = param.FloatInput(); - Tensor *out = param.Out(); - - fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate(reinterpret_cast(in_x->data()), - in_x->numel() * sizeof(float)); - tanhFuntor(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fpga/V2/transpose2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/transpose2_kernel.cpp deleted file mode 100644 index cc839a971e..0000000000 --- a/mobile/src/operators/kernel/fpga/V2/transpose2_kernel.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef TRANSPOSE2_OP - -#include "operators/kernel/transpose2_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool Transpose2Kernel::Init(Transpose2Param *param) { - auto input = param->InputX(); - auto output = param->Out(); - auto axis = param->Axis(); - auto dim = input->dims(); - output->ShareDataWith(*input); - - auto dim_v = vectorize(dim); - - for (int i = 0; i < axis.size(); i++) { - dim_v[i] = dim[axis[i]]; - } - output->Resize(framework::make_ddim(dim_v)); - - DLOG << "input: " << input; - DLOG << "output: " << output; - return true; -} - -template <> -void Transpose2Kernel::Compute( - const Transpose2Param ¶m) { - // Transpose2Compute(param); - auto input = param.InputX(); - auto output = param.Out(); - - output->Resize({input->dims()[0], output->dims()[1], output->dims()[2], - output->dims()[3]}); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/fusion_fc_kernel.h b/mobile/src/operators/kernel/fusion_fc_kernel.h deleted file mode 100644 index b8086bc66f..0000000000 --- a/mobile/src/operators/kernel/fusion_fc_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef FUSION_FC_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class FusionFcKernel - : public framework::OpKernelBase> { - public: - void Compute(const FusionFcParam& param); - bool Init(FusionFcParam* param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/grid_sampler_kernel.h b/mobile/src/operators/kernel/grid_sampler_kernel.h deleted file mode 100644 index bbadb6b54a..0000000000 --- a/mobile/src/operators/kernel/grid_sampler_kernel.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef GRID_SAMPLER_OP -DECLARE_KERNEL(GridSampler, GridSamplerParam); -#endif // GRID_SAMPLER_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/gru_kernel.h b/mobile/src/operators/kernel/gru_kernel.h deleted file mode 100644 index b03b2e3ecb..0000000000 --- a/mobile/src/operators/kernel/gru_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef GRU_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class GruKernel - : public framework::OpKernelBase> { - public: - void Compute(const GruParam& param); - bool Init(GruParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/gru_unit_kernel.h b/mobile/src/operators/kernel/gru_unit_kernel.h deleted file mode 100644 index bda17cd205..0000000000 --- a/mobile/src/operators/kernel/gru_unit_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef GRU_UNIT_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class GruUnitKernel - : public framework::OpKernelBase> { - public: - void Compute(const GruUnitParam& param); - bool Init(GruUnitParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/im2sequence_kernel.h b/mobile/src/operators/kernel/im2sequence_kernel.h deleted file mode 100644 index b15eb68996..0000000000 --- a/mobile/src/operators/kernel/im2sequence_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef IM2SEQUENCE_OP - -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -#pragma once - -namespace paddle_mobile { -namespace operators { - -template -class Im2SequenceKernel - : public framework::OpKernelBase> { - public: - void Compute(const Im2SequenceParam& param); - bool Init(Im2SequenceParam* para); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/increment_kernel.h b/mobile/src/operators/kernel/increment_kernel.h deleted file mode 100644 index 43a930c1b9..0000000000 --- a/mobile/src/operators/kernel/increment_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef INCREMENT_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class IncrementKernel - : public framework::OpKernelBase> { - public: - void Compute(const IncrementParam ¶m); - bool Init(IncrementParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/instancenorm_kernel.h b/mobile/src/operators/kernel/instancenorm_kernel.h deleted file mode 100644 index 2333d0cc0f..0000000000 --- a/mobile/src/operators/kernel/instancenorm_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef INSTANCENORM_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class InstanceNormKernel - : public framework::OpKernelBase> { - public: - void Compute(const InstanceNormParam ¶m); - bool Init(InstanceNormParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/instancenorm_relu_kernel.h b/mobile/src/operators/kernel/instancenorm_relu_kernel.h deleted file mode 100644 index cb2a0e1f3c..0000000000 --- a/mobile/src/operators/kernel/instancenorm_relu_kernel.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef FUSION_INSTANCENORM_RELU_OP - -#include -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class InstanceNormReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionInstanceNormReluParam ¶m); - bool Init(FusionInstanceNormReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/is_empty_kernel.h b/mobile/src/operators/kernel/is_empty_kernel.h deleted file mode 100644 index 0a6806d087..0000000000 --- a/mobile/src/operators/kernel/is_empty_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef IS_EMPTY_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class IsEmptyKernel - : public framework::OpKernelBase> { - public: - void Compute(const IsEmptyParam ¶m); - bool Init(IsEmptyParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/kernels.h b/mobile/src/operators/kernel/kernels.h deleted file mode 100644 index 668344674c..0000000000 --- a/mobile/src/operators/kernel/kernels.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef TOP_K_OP -DECLARE_KERNEL(TopK, TopKParam); -#endif // TOP_K_OP - -#ifdef CAST_OP -DECLARE_KERNEL(Cast, CastParam); -#endif // CAST_OP - -#ifdef LOD_RESET_OP -DECLARE_KERNEL(LodReset, LodResetParam); -#endif // LOD_RESET_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/logical_kernel.h b/mobile/src/operators/kernel/logical_kernel.h deleted file mode 100644 index b42ae27005..0000000000 --- a/mobile/src/operators/kernel/logical_kernel.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef LOGICAL_AND_OP -DECLARE_KERNEL(LogicalAnd, LogicalBinaryParam); -#endif - -#ifdef LOGICAL_OR_OP -DECLARE_KERNEL(LogicalOr, LogicalBinaryParam); -#endif - -#ifdef LOGICAL_NOT_OP -DECLARE_KERNEL(LogicalNot, LogicalUnaryParam); -#endif - -#ifdef LOGICAL_XOR_OP -DECLARE_KERNEL(LogicalXor, LogicalBinaryParam); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/lookup_kernel.h b/mobile/src/operators/kernel/lookup_kernel.h deleted file mode 100644 index 8c29349e73..0000000000 --- a/mobile/src/operators/kernel/lookup_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LOOKUP_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class LookupKernel - : public framework::OpKernelBase> { - public: - void Compute(const LookupParam& param); - bool Init(LookupParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/lrn_kernel.h b/mobile/src/operators/kernel/lrn_kernel.h deleted file mode 100644 index 486c828aca..0000000000 --- a/mobile/src/operators/kernel/lrn_kernel.h +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef LRN_OP - -#include -#ifdef _OPENMP -#include -#endif -#ifdef __ARM_NEON -#include -#include "operators/math/math.h" -#endif -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -struct LRNFunctor { - void operator()(const framework::Tensor &input, framework::Tensor *out, int N, - int C, int H, int W, int n, float k, float alpha, - float beta) { - const float *input_ptr = input.data(); - const int start = -(n - 1) / 2; - const int end = start + n; - auto out_ptr = out->data(); - - const int stride0 = C * H * W; - const int stride1 = H * W; - const int stride2 = W; - framework::Tensor sqr_buffer; - auto sqr_buffer_ptr = sqr_buffer.mutable_data(input.dims()); - std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0); - - for (int a = 0; a < N; a++) { -#pragma parallel for - for (int b = 0; b < C; b++) { - for (int index = start; index < end; index++) { - int channel = b + index; - if (channel >= 0 && channel < C) { - int tmp_s = a * stride0 + b * stride1; - int tmp_c = a * stride0 + channel * stride1; -#ifdef __ARM_NEON - int n4 = stride1 / 4; - int m4 = stride1 % 4; - float32x4_t sqr0; - float32x4_t in0; - float32x4_t res0; - for (int i = 0; i < n4; i++) { - sqr0 = vld1q_f32(sqr_buffer_ptr + tmp_s); - in0 = vld1q_f32(input_ptr + tmp_c); - - res0 = vmlaq_f32(sqr0, in0, in0); - vst1q_f32(sqr_buffer_ptr + tmp_s, res0); - - tmp_s += 4; - tmp_c += 4; - } - - for (int i = 0; i < m4; i++) { - int s_i = tmp_s + i; - int c_i = tmp_c + i; - sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i]; - } - -#else - for (int tmp = 0; tmp < stride1; tmp++) { - int s_i = tmp_s + tmp; - int c_i = tmp_c + tmp; - sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i]; - } -#endif - } - } - } - } - -#ifdef __ARM_NEON - - float32x4_t sqr1, sqr2, sqr3, sqr4; - float32x4_t alpha4; - float32x4_t k4; - float32x4_t beta4; - float32x4_t res1, res2, res3, res4; - float32x4_t in1, 
in2, in3, in4; - - beta4 = vdupq_n_f32(beta); - alpha4 = vdupq_n_f32(alpha); - k4 = vdupq_n_f32(k); - auto out_tmp_ptr = out_ptr; - - int n16 = input.numel() / 16; - int m16 = input.numel() % 16; - int m16n4 = m16 / 4; - int m16m4 = m16 % 4; - - for (int i = 0; i < n16; i++) { - sqr1 = vld1q_f32(sqr_buffer_ptr); - sqr2 = vld1q_f32(sqr_buffer_ptr + 4); - sqr3 = vld1q_f32(sqr_buffer_ptr + 8); - sqr4 = vld1q_f32(sqr_buffer_ptr + 12); - - in1 = vld1q_f32(input_ptr); - in2 = vld1q_f32(input_ptr + 4); - in3 = vld1q_f32(input_ptr + 8); - in4 = vld1q_f32(input_ptr + 12); - - sqr1 = vmlaq_f32(k4, sqr1, alpha4); - sqr2 = vmlaq_f32(k4, sqr2, alpha4); - sqr3 = vmlaq_f32(k4, sqr3, alpha4); - sqr4 = vmlaq_f32(k4, sqr4, alpha4); - - sqr1 = pow_ps(sqr1, -beta4); - sqr2 = pow_ps(sqr2, -beta4); - sqr3 = pow_ps(sqr3, -beta4); - sqr4 = pow_ps(sqr4, -beta4); - - sqr1 = vmulq_f32(sqr1, in1); - sqr2 = vmulq_f32(sqr2, in2); - sqr3 = vmulq_f32(sqr3, in3); - sqr4 = vmulq_f32(sqr4, in4); - - vst1q_f32(out_tmp_ptr, sqr1); - vst1q_f32(out_tmp_ptr + 4, sqr2); - vst1q_f32(out_tmp_ptr + 8, sqr3); - vst1q_f32(out_tmp_ptr + 12, sqr4); - - sqr_buffer_ptr += 4 * 4; - input_ptr += 4 * 4; - out_tmp_ptr += 4 * 4; - } - for (int i = 0; i < m16n4; i++) { - sqr4 = vld1q_f32(sqr_buffer_ptr); - in4 = vld1q_f32(input_ptr); - sqr4 = vmlaq_f32(k4, sqr4, alpha4); - sqr4 = pow_ps(sqr4, -beta4); - sqr4 = vmulq_f32(sqr4, in4); - vst1q_f32(out_tmp_ptr, sqr4); - sqr_buffer_ptr += 4; - input_ptr += 4; - out_tmp_ptr += 4; - } - - for (int i = 0; i < m16m4; i++) { - out_tmp_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta); - } - -#else - for (int i = 0; i < input.numel(); i++) { - out_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta); - } -#endif - } -}; - -template -class LrnKernel - : public framework::OpKernelBase> { - public: - void Compute(const LrnParam ¶m); - bool Init(LrnParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git 
a/mobile/src/operators/kernel/mul_kernel.h b/mobile/src/operators/kernel/mul_kernel.h deleted file mode 100644 index 8deb4a2cb7..0000000000 --- a/mobile/src/operators/kernel/mul_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MUL_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/math/math_function.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using namespace framework; - -template -class MulKernel - : public framework::OpKernelBase> { - public: - void Compute(const MulParam ¶m); - bool Init(MulParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/multiclass_nms_kernel.h b/mobile/src/operators/kernel/multiclass_nms_kernel.h deleted file mode 100644 index 6a4ac0c229..0000000000 --- a/mobile/src/operators/kernel/multiclass_nms_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MULTICLASSNMS_OP - -#pragma once - -#include "framework/operator.h" - -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class MultiClassNMSKernel - : public framework::OpKernelBase> { - public: - void Compute(const MultiClassNMSParam& param); - bool Init(MultiClassNMSParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/nearest_interp_kernel.h b/mobile/src/operators/kernel/nearest_interp_kernel.h deleted file mode 100644 index cb2d186312..0000000000 --- a/mobile/src/operators/kernel/nearest_interp_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef NEAREST_INTERP_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class NearestInterpolationKernel - : public framework::OpKernelBase> { - public: - void Compute(const NearestInterpolationParam& param); - bool Init(NearestInterpolationParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/norm_kernel.h b/mobile/src/operators/kernel/norm_kernel.h deleted file mode 100644 index 4f945bdb8b..0000000000 --- a/mobile/src/operators/kernel/norm_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef NORM_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class NormKernel - : public framework::OpKernelBase> { - public: - void Compute(const NormParam ¶m); - bool Init(NormParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/one_hot_kernel.h b/mobile/src/operators/kernel/one_hot_kernel.h deleted file mode 100644 index 2cb2e59eb3..0000000000 --- a/mobile/src/operators/kernel/one_hot_kernel.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef ONE_HOT_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class OnehotParam : public OpParam { - public: - OnehotParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = GET_VAR_AS_LOD_TENSOR("X", inputs, *scope); - output_ = GET_VAR_AS_LOD_TENSOR("Out", outputs, *scope); - - depth_ = OpParam::GetAttr("depth", attrs); - dtype_ = OpParam::GetAttr("dtype", attrs); - } - - public: - framework::LoDTensor *input_; - framework::LoDTensor *output_; - - int depth_; - int dtype_; -}; - -DECLARE_KERNEL(Onehot, OnehotParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // ONE_HOT_OP diff --git a/mobile/src/operators/kernel/pad2d_kernel.h b/mobile/src/operators/kernel/pad2d_kernel.h deleted file mode 100644 index 0d1d1408ba..0000000000 --- a/mobile/src/operators/kernel/pad2d_kernel.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PAD2D_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -// template -// class Pad2DParam : public OpParam { -// public: -// Pad2DParam(const VariableNameMap &inputs, const VariableNameMap &outputs, -// const AttributeMap &attrs, Scope *scope) -// : OpParam(inputs, outputs, attrs, scope) { -// input_ = OpParam::GetVarValue("X", inputs, *scope); -// output_ = -// OpParam::GetVarValue("Out", outputs, *scope); -// paddings_ = OpParam::GetAttr>("paddings", attrs); -// pad_value_ = OpParam::GetAttr("pad_value", attrs); -// mode_ = OpParam::GetStringAttr("mode", attrs); -// } -// -// public: -// framework::LoDTensor *input_; -// framework::LoDTensor *output_; -// std::vector paddings_; -// float pad_value_; -// std::string mode_; -//}; - -DECLARE_KERNEL(Pad2D, Pad2DParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // PAD2D_OP diff --git a/mobile/src/operators/kernel/pixel_shuffle_kernel.h b/mobile/src/operators/kernel/pixel_shuffle_kernel.h deleted file mode 100644 index 3f95c866f8..0000000000 --- a/mobile/src/operators/kernel/pixel_shuffle_kernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef LRN_OP - -#include -#ifdef _OPENMP -#include -#endif -#ifdef __ARM_NEON -#include -#include "operators/math/math.h" -#endif -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class PixelShuffleKernel - : public framework::OpKernelBase> { - public: - void Compute(const PixelShuffleParam ¶m); - bool Init(PixelShuffleParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/polygon_box_transform_kernel.h b/mobile/src/operators/kernel/polygon_box_transform_kernel.h deleted file mode 100644 index 6ed003a4c7..0000000000 --- a/mobile/src/operators/kernel/polygon_box_transform_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POLYGONBOXTRANSFORM_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class PolygonBoxTransformKernel - : public framework::OpKernelBase> { - public: - void Compute(const PolygonBoxTransformParam& param); - bool Init(PolygonBoxTransformParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/pool_kernel.h b/mobile/src/operators/kernel/pool_kernel.h deleted file mode 100644 index ff80e0e445..0000000000 --- a/mobile/src/operators/kernel/pool_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using framework::OpKernelBase; - -template -class PoolKernel : public OpKernelBase> { - public: - void Compute(const PoolParam ¶m); - bool Init(PoolParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/prelu_kernel.h b/mobile/src/operators/kernel/prelu_kernel.h deleted file mode 100644 index c043149243..0000000000 --- a/mobile/src/operators/kernel/prelu_kernel.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/operator.h" -#include "operators/op_param.h" - -#pragma once - -namespace paddle_mobile { -namespace operators { - -template -class PReluKernel - : public framework::OpKernelBase> { - public: - void Compute(const PReluParam& param); -}; -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/prior_box_kernel.h b/mobile/src/operators/kernel/prior_box_kernel.h deleted file mode 100644 index c5d561083d..0000000000 --- a/mobile/src/operators/kernel/prior_box_kernel.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include "framework/operator.h" -#include "operators/math/transform.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef PRIORBOX_OP -inline void ExpandAspectRatios(const std::vector &input_aspect_ratior, - bool flip, - std::vector *output_aspect_ratior) { - constexpr float epsilon = 1e-6; - output_aspect_ratior->clear(); - output_aspect_ratior->push_back(1.0f); - for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { - float ar = input_aspect_ratior[i]; - bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { - if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { - already_exist = true; - break; - } - } - if (!already_exist) { - output_aspect_ratior->push_back(ar); - if (flip) { - output_aspect_ratior->push_back(1.0f / ar); - } - } - } -} - -DECLARE_KERNEL(PriorBox, PriorBoxParam); -#endif // PRIORBOX_OP - -#ifdef DENSITY_PRIORBOX_OP -template -class DensityPriorBoxParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - - public: - DensityPriorBoxParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputFrom(inputs, *scope); - input_image_ = InputImageFrom(inputs, *scope); - output_boxes_ = OutputBoxesFrom(outputs, *scope); - output_variances_ = OutputVariancesFrom(outputs, *scope); - variances_ = GetAttr>("variances", attrs); - clip_ = GetAttr("clip", attrs); - flatten_to_2d_ = GetAttr("flatten_to_2d", attrs); - step_w_ = GetAttr("step_w", attrs); - step_h_ = GetAttr("step_h", attrs); - offset_ = GetAttr("offset", attrs); - fixed_sizes_ = GetAttr>("fixed_sizes", attrs); - fixed_ratios_ = GetAttr>("fixed_ratios", attrs); - densities_ = GetAttr>("densities", attrs); - } - - ~DensityPriorBoxParam() {} - - const GType *Input() const { return input_; } - const GType *InputImage() const { 
return input_image_; } - GType *OutputBoxes() const { return output_boxes_; } - GType *OutputVariances() const { return output_variances_; } - const bool Clip() const { return clip_; } - const bool FlattenTo2d() const { return flatten_to_2d_; } - const float StepW() const { return step_w_; } - const float StepH() const { return step_h_; } - const float Offset() const { return offset_; } - const vector &FixedSizes() const { return fixed_sizes_; } - const vector &FixedRatios() const { return fixed_ratios_; } - const vector &Densities() const { return densities_; } - const vector &Variances() const { return variances_; } - GType *getNewDensity() const { return new_density.get(); } - void setNewDensity(GType *newDensity) { new_density.reset(newDensity); } - - public: - GType *input_; - GType *input_image_; - GType *output_boxes_; - GType *output_variances_; - bool clip_; - bool flatten_to_2d_; - float step_w_; - float step_h_; - float offset_; - vector fixed_sizes_; - vector fixed_ratios_; - vector densities_; - vector variances_; - std::shared_ptr new_density; -}; - -DECLARE_KERNEL(DensityPriorBox, DensityPriorBoxParam); -#endif // DENSITY_PRIORBOX_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/quantize_kernel.h b/mobile/src/operators/kernel/quantize_kernel.h deleted file mode 100644 index d864e00d9c..0000000000 --- a/mobile/src/operators/kernel/quantize_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef QUANT_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class QuantizeKernel - : public framework::OpKernelBase> { - public: - void Compute(const QuantizeParam ¶m); - bool Init(QuantizeParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/range_kernel.cpp b/mobile/src/operators/kernel/range_kernel.cpp deleted file mode 100644 index 9384eb0195..0000000000 --- a/mobile/src/operators/kernel/range_kernel.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RANGE_OP - -#include "operators/kernel/range_kernel.h" -#include "framework/data_type.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool RangeKernel::Init(RangeParam* param) { - return true; -} - -template <> -void RangeKernel::Compute(const RangeParam& param) { - int start = param.Start()->data()[0]; - int end = param.End()->data()[0]; - int step = param.Step()->data()[0]; - auto* out = param.Output(); - - int64_t size = 0; - GetSize(start, end, step, &size); - out->Resize(framework::make_ddim({size})); - auto* out_data = out->mutable_data(); - auto value = start; - for (int64_t i = 0; i < size; ++i) { - out_data[i] = value; - value += step; - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // RANGE_OP diff --git a/mobile/src/operators/kernel/range_kernel.h b/mobile/src/operators/kernel/range_kernel.h deleted file mode 100644 index 36429461b2..0000000000 --- a/mobile/src/operators/kernel/range_kernel.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RANGE_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -inline void GetSize(float start, float end, float step, int64_t *size) { - PADDLE_MOBILE_ENFORCE(!std::equal_to()(step, 0), - "The step of range op should not be 0."); - PADDLE_MOBILE_ENFORCE( - ((start < end) && (step > 0)) || ((start > end) && (step < 0)), - "The step should be greater than 0 while start < end. And the " - "step should be less than 0 while start > end."); - *size = std::is_integral::value - ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step)) - : std::ceil(std::abs((end - start) / step)); -} - -template -class RangeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - RangeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - start_ = OpParam::GetVarValue("Start", inputs, *scope); - end_ = OpParam::GetVarValue("End", inputs, *scope); - step_ = OpParam::GetVarValue("Step", inputs, *scope); - output_ = OpParam::OutFrom(outputs, *scope); - } - - GType *Start() const { return start_; } - const GType *End() const { return end_; } - const GType *Step() const { return step_; } - GType *Output() const { return output_; } - - private: - GType *start_; - GType *end_; - GType *step_; - GType *output_; -}; - -DECLARE_KERNEL(Range, RangeParam); - -} // namespace operators -} // namespace paddle_mobile - -#endif // RANGE_OP diff --git a/mobile/src/operators/kernel/reduce_prod_kernel.cpp b/mobile/src/operators/kernel/reduce_prod_kernel.cpp deleted file mode 100644 index c40e5c4615..0000000000 --- a/mobile/src/operators/kernel/reduce_prod_kernel.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef REDUCE_PROD_OP - -#include "operators/kernel/reduce_prod_kernel.h" -#include -#include -#include "framework/data_type.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ReduceProdKernel::Init(ReduceProdParam* param) { - return true; -} - -template <> -void ReduceProdKernel::Compute(const ReduceProdParam& param) { - auto* input = param.Input(); - if (input->type() == type_id().hash_code()) { - bool reduce_all = param.isReduceAll(); - auto* output = param.Output(); - auto dim = param.getDim(); - auto* out_data = output->mutable_data(); - const auto* input_x_data = input->data(); - - auto dims = param.getDim(); - bool keep_dim = param.isKeepDim(); - - if (reduce_all) { - size_t stride = 1; - for (int j = dim[0]; j < input->dims().size(); ++j) { - stride *= input->dims()[j]; - } - auto numel = output->numel(); - for (int i = 0; i < numel; i++) { - int64_t mul = 1; - for (int j = 0; j < stride; ++j, ++input_x_data) { - mul *= (*input_x_data); - } - out_data[i] = mul; - } - } else { - // todo - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // REDUCE_PROD_OP diff --git a/mobile/src/operators/kernel/reduce_prod_kernel.h b/mobile/src/operators/kernel/reduce_prod_kernel.h deleted file mode 100644 index 73c93fdc0b..0000000000 --- a/mobile/src/operators/kernel/reduce_prod_kernel.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef REDUCE_PROD_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class ReduceProdParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ReduceProdParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = OpParam::InputXFrom(inputs, *scope); - output_ = OpParam::OutFrom(outputs, *scope); - reduce_all_ = GetAttr("reduce_all", attrs); - keep_dim_ = GetAttr("keep_dim", attrs); - dim_ = GetAttr>("dim", attrs); - } - - const GType *Input() const { return input_; } - - GType *Output() const { return output_; } - - bool isReduceAll() const { return reduce_all_; } - - bool isKeepDim() const { return keep_dim_; } - - const vector getDim() const { return dim_; } - - private: - GType *input_; - GType *output_; - bool reduce_all_; - bool keep_dim_; - std::vector dim_; -}; - -DECLARE_KERNEL(ReduceProd, ReduceProdParam) - -} // namespace operators -} // namespace paddle_mobile - -#endif // REDUCE_PROD_OP diff --git a/mobile/src/operators/kernel/reshape2_kernel.h b/mobile/src/operators/kernel/reshape2_kernel.h deleted file mode 100644 index c6ab3cf72a..0000000000 --- a/mobile/src/operators/kernel/reshape2_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ 
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef RESHAPE2_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class Reshape2Kernel - : public framework::OpKernelBase> { - public: - void Compute(const Reshape2Param& param); - bool Init(Reshape2Param* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/reshape_kernel.h b/mobile/src/operators/kernel/reshape_kernel.h deleted file mode 100644 index a540565487..0000000000 --- a/mobile/src/operators/kernel/reshape_kernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE_OP - -#pragma once - -#include -#include "framework/operator.h" - -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -inline framework::DDim ValidateShape(const std::vector shape, - const framework::DDim& in_dims) { - const int64_t in_size = framework::product(in_dims); - // only one dimension can be set to -1, whose size will be automatically - // infered. - const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; - - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_MOBILE_ENFORCE( - unk_dim_idx == -1, - "Only one input dimension of Attr(shape) can be unknown."); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_MOBILE_ENFORCE( - static_cast(i) < in_dims.size(), - "The index of dimension to copy from input shape must be less " - "than the size of input shape."); - } else { - PADDLE_MOBILE_ENFORCE( - shape[i] > 0, - "Each input dimension of Attr(shape) must not be negtive except " - "one unknown dimension."); - } - - capacity *= (shape[i] ? shape[i] : in_dims[i]); - output_shape[i] = (shape[i] ? 
static_cast(shape[i]) : in_dims[i]); - } - - if (unk_dim_idx != -1) { - output_shape[unk_dim_idx] = -in_size / capacity; - PADDLE_MOBILE_ENFORCE(output_shape[unk_dim_idx] * capacity == -in_size, - "Invalid shape is given."); - } else { - PADDLE_MOBILE_ENFORCE(capacity == in_size, "Invalid shape is given."); - } - return framework::make_ddim(output_shape); -} - -template -class ReshapeKernel - : public framework::OpKernelBase> { - public: - void Compute(const ReshapeParam& param); - bool Init(ReshapeParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/resize_kernel.h b/mobile/src/operators/kernel/resize_kernel.h deleted file mode 100644 index b25a0dcef5..0000000000 --- a/mobile/src/operators/kernel/resize_kernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESIZE_OP - -#pragma once - -#include -#include "framework/operator.h" - -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -inline framework::DDim CalOutputShape(const ResizeParam ¶m) { - const auto *input_x = param.InputX(); - const auto &input_x_dims = input_x->dims(); - auto *out = param.Out(); - framework::DDim out_dims = out->dims(); - const auto *input_shape = param.InputShape(); - - if (input_shape) { - input_x->dims()[0]; - auto *shape_data = input_shape->template data(); - framework::Tensor cpu_shape_tensor; - auto shape = - std::vector(shape_data, shape_data + input_shape->numel()); - const int in_batch_size = input_x->dims()[0]; - const int in_chan_size = input_x->dims()[1]; - const int in_height = input_x->dims()[2]; - const int in_width = input_x->dims()[3]; - - int out_height = 0; - int out_width = 0; - bool is_pyramid_test = param.IsPyramidTest(); - if (is_pyramid_test == false) { - out_height = param.Height(); - out_width = param.Width(); - PADDLE_MOBILE_ENFORCE(out_height > 0, "output height is required"); - PADDLE_MOBILE_ENFORCE(out_width > 0, "output width is required"); - - } else { - float out_height_scale = param.OutHeightScale(); - float out_width_scale = param.OutWidthScale(); - PADDLE_MOBILE_ENFORCE(out_height_scale > 0, - "output height scale is required"); - PADDLE_MOBILE_ENFORCE(out_width_scale > 0, - "output width scale is required"); - - out_height = int(out_height_scale * in_height); - out_width = int(out_width_scale * in_width); - } - - out_dims = framework::make_ddim( - {in_batch_size, in_chan_size, in_height, in_width}); - } - return out_dims; -} - -template -class ResizeKernel - : public framework::OpKernelBase> { - public: - void Compute(const ResizeParam ¶m); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/scale_kernel.h b/mobile/src/operators/kernel/scale_kernel.h deleted file mode 100644 index 
4b0c8f457c..0000000000 --- a/mobile/src/operators/kernel/scale_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SCALE_OP - -#include "framework/operator.h" -#include "operators/op_param.h" - -#pragma once - -namespace paddle_mobile { -namespace operators { - -template -class ScaleKernel - : public framework::OpKernelBase> { - public: - void Compute(const ScaleParam& param); - bool Init(ScaleParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/sequence_kernels.h b/mobile/src/operators/kernel/sequence_kernels.h deleted file mode 100644 index ccee8c5216..0000000000 --- a/mobile/src/operators/kernel/sequence_kernels.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef SEQUENCE_EXPAND_OP -DECLARE_KERNEL(SequenceExpand, SequenceExpandParam); -#endif // SEQUENCE_EXPAND_OP - -#ifdef SEQUENCE_POOL_OP -DECLARE_KERNEL(SequencePool, SequencePoolParam); -#endif // SEQUENCE_POOL_OP - -#ifdef SEQUENCE_SOFTMAX_OP -DECLARE_KERNEL(SequenceSoftmax, SoftmaxParam); -#endif // SEQUENCE_SOFTMAX_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/shape_kernel.h b/mobile/src/operators/kernel/shape_kernel.h deleted file mode 100644 index 9d3c6e1701..0000000000 --- a/mobile/src/operators/kernel/shape_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SHAPE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class ShapeKernel - : public framework::OpKernelBase> { - public: - void Compute(const ShapeParam& param); - bool Init(ShapeParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/slice_kernel.h b/mobile/src/operators/kernel/slice_kernel.h deleted file mode 100644 index 89dba51d9e..0000000000 --- a/mobile/src/operators/kernel/slice_kernel.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "framework/operator.h" -#include "operators/op_param.h" - -#pragma once - -namespace paddle_mobile { -namespace operators { - -template -class SliceKernel - : public framework::OpKernelBase> { - public: - void Compute(const SliceParam& param); - bool Init(SliceParam* param); -}; -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/softmax_kernel.h b/mobile/src/operators/kernel/softmax_kernel.h deleted file mode 100644 index d7d7435fd5..0000000000 --- a/mobile/src/operators/kernel/softmax_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SOFTMAX_OP - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using framework::OpKernelBase; - -template -class SoftmaxKernel - : public OpKernelBase> { - public: - void Compute(const SoftmaxParam ¶m); - bool Init(SoftmaxParam *param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/split_kernel.h b/mobile/src/operators/kernel/split_kernel.h deleted file mode 100644 index 3a2c03dce7..0000000000 --- a/mobile/src/operators/kernel/split_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SPLIT_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class SplitKernel - : public framework::OpKernelBase> { - public: - void Compute(const SplitParam& param); - bool Init(SplitParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/sum_kernel.h b/mobile/src/operators/kernel/sum_kernel.h deleted file mode 100644 index 967d6f8307..0000000000 --- a/mobile/src/operators/kernel/sum_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SUM_OP - -#pragma once -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class SumKernel - : public framework::OpKernelBase> { - public: - void Compute(const SumParam ¶m); - bool Init(SumParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/tanh_kernel.h b/mobile/src/operators/kernel/tanh_kernel.h deleted file mode 100644 index 035f64f840..0000000000 --- a/mobile/src/operators/kernel/tanh_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef TANH_OP - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::OpKernelBase; - -template -class TanhKernel : public OpKernelBase> { - public: - void Compute(const TanhParam& param); - bool Init(TanhParam* param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/tensor_array_read_write_kernel.h b/mobile/src/operators/kernel/tensor_array_read_write_kernel.h deleted file mode 100644 index 8b666c0b40..0000000000 --- a/mobile/src/operators/kernel/tensor_array_read_write_kernel.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef WRITE_TO_ARRAY_OP -DECLARE_KERNEL(WriteToArray, WriteToArrayParam); -#endif // WRITE_TO_ARRAY_OP - -#ifdef READ_FROM_ARRAY_OP -DECLARE_KERNEL(ReadFromArray, ReadFromArrayParam); -#endif // READ_FROM_ARRAY_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/transpose2_kernel.h b/mobile/src/operators/kernel/transpose2_kernel.h deleted file mode 100644 index a1fb186db0..0000000000 --- a/mobile/src/operators/kernel/transpose2_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TRANSPOSE2_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class Transpose2Kernel - : public framework::OpKernelBase> { - public: - void Compute(const Transpose2Param& param); - bool Init(Transpose2Param* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/transpose_kernel.h b/mobile/src/operators/kernel/transpose_kernel.h deleted file mode 100644 index 63ee6eb172..0000000000 --- a/mobile/src/operators/kernel/transpose_kernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TRANSPOSE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class TransposeKernel - : public framework::OpKernelBase> { - public: - void Compute(const TransposeParam& param); - bool Init(TransposeParam* param); -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/kernel/while_kernel.h b/mobile/src/operators/kernel/while_kernel.h deleted file mode 100644 index 6882ef047f..0000000000 --- a/mobile/src/operators/kernel/while_kernel.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/operator.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef WHILE_OP -template -class WhileParam : public OpParam { - public: - WhileParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : scope_(scope), OpParam(inputs, outputs, attrs, scope) { - cond_ = - OpParam::GetVarValue("Condition", inputs, *scope); - sub_block_ = OpParam::GetAttr("sub_block", attrs); - is_test = OpParam::GetAttr("is_test", attrs); - } - - public: - Scope *scope_; - framework::LoDTensor *cond_; - framework::BlockDesc *sub_block_; - bool is_test; -}; - -DECLARE_KERNEL(While, WhileParam); -#endif // WHILE_OP - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/lod_reset_op.cpp b/mobile/src/operators/lod_reset_op.cpp deleted file mode 100644 index c4100ba8d7..0000000000 --- a/mobile/src/operators/lod_reset_op.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LOD_RESET_OP - -#include "operators/lod_reset_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void LodResetOp::InferShape() const { - const auto &input_dims = this->param_.input_x_->dims(); - this->param_.output_->Resize(input_dims); - if (std::is_same, Dtype>::value) { - if (this->param_.append) { - this->param_.output_->set_lod(this->param_.input_x_->lod()); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(lod_reset, ops::LodResetOp); -#endif - -#endif // LOD_RESET_OP diff --git a/mobile/src/operators/lod_reset_op.h b/mobile/src/operators/lod_reset_op.h deleted file mode 100644 index 46932dcfab..0000000000 --- a/mobile/src/operators/lod_reset_op.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LOD_RESET_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/kernels.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(LodReset, LodResetParam, LodResetKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif // LOD_RESET_OP diff --git a/mobile/src/operators/logical_op.cpp b/mobile/src/operators/logical_op.cpp deleted file mode 100644 index 6478516be0..0000000000 --- a/mobile/src/operators/logical_op.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/logical_op.h" - -namespace paddle_mobile { -namespace operators { - -#define DEFINE_LOGICAL_INFERSHAPE(OpName) \ - template \ - void OpName##Op::InferShape() const { \ - const auto &input_dims = this->param_.InputX()->dims(); \ - this->param_.Out()->Resize(input_dims); \ - } - -#ifdef LOGICAL_AND_OP -DEFINE_LOGICAL_INFERSHAPE(LogicalAnd); -#endif // TLOGICAL_AND_OP - -#ifdef LOGICAL_OR_OP -DEFINE_LOGICAL_INFERSHAPE(LogicalOr); -#endif // TLOGICAL_OR_OP - -#ifdef LOGICAL_NOT_OP -DEFINE_LOGICAL_INFERSHAPE(LogicalNot); -#endif // LOGICAL_NOT_OP - -#ifdef LOGICAL_XOR_OP -DEFINE_LOGICAL_INFERSHAPE(LogicalXor); -#endif // TLOGICAL_XOR_OP - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef LOGICAL_AND_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(logical_and, ops::LogicalAndOp); -#endif -#endif // LOGICAL_AND_OP - -#ifdef LOGICAL_OR_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(logical_or, ops::LogicalOrOp); -#endif -#endif // LOGICAL_OR_OP - -#ifdef LOGICAL_NOT_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(logical_not, ops::LogicalNotOp); -#endif -#endif // LOGICAL_NOT_OP - -#ifdef LOGICAL_XOR_OP -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(logical_xor, ops::LogicalXorOp); -#endif -#endif // LOGICAL_XOR_OP diff --git a/mobile/src/operators/logical_op.h b/mobile/src/operators/logical_op.h deleted file mode 100644 index a3cd2fb605..0000000000 --- a/mobile/src/operators/logical_op.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/logical_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef LOGICAL_AND_OP -DECLARE_OPERATOR(LogicalAnd, LogicalBinaryParam, LogicalAndKernel); -#endif - -#ifdef LOGICAL_OR_OP -DECLARE_OPERATOR(LogicalOr, LogicalBinaryParam, LogicalOrKernel); -#endif - -#ifdef LOGICAL_NOT_OP -DECLARE_OPERATOR(LogicalNot, LogicalUnaryParam, LogicalNotKernel); -#endif - -#ifdef LOGICAL_XOR_OP -DECLARE_OPERATOR(LogicalXor, LogicalBinaryParam, LogicalXorKernel); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/lookup_op.cpp b/mobile/src/operators/lookup_op.cpp deleted file mode 100644 index 682e71221e..0000000000 --- a/mobile/src/operators/lookup_op.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LOOKUP_OP - -#include - -#include "common/enforce.h" -#include "operators/lookup_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void LookupOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.InputW() != nullptr, - "Input(W) of LookupTableOp should not be null."); - auto *ids_t = this->param_.InputIds(); - - PADDLE_MOBILE_ENFORCE(ids_t != nullptr, - "Input(Ids) of LookupTableOp should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, - "Output(Out) of LookupTableOp should not be null."); - // this->param__.InputW()-> - - auto table_dims = this->param_.InputW()->dims(); - auto ids_dims = ids_t->dims(); - - int ids_rank = ids_dims.size(); - - PADDLE_MOBILE_ENFORCE(table_dims.size() == 2, - "table_dims.size()==2 check failed"); - - PADDLE_MOBILE_ENFORCE(ids_dims[ids_rank - 1] == 1, - "The last dimension of the 'Ids' tensor must be 1."); - - auto output_dims = - framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); - output_dims.push_back(table_dims[1]); - - this->param_.Out()->Resize(framework::make_ddim(output_dims)); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(lookup_table, ops::LookupOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/lookup_op.h b/mobile/src/operators/lookup_op.h deleted file mode 100644 index e99936a711..0000000000 --- a/mobile/src/operators/lookup_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef LOOKUP_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/lookup_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class LookupOp : public framework::OperatorWithKernel< - DeviceType, LookupParam, - operators::LookupKernel> { - public: - LookupOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::LookupKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/lrn_op.cpp b/mobile/src/operators/lrn_op.cpp deleted file mode 100644 index 9b0745b113..0000000000 --- a/mobile/src/operators/lrn_op.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LRN_OP - -#include "operators/lrn_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void LrnOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - this->param_.Out()->Resize(x_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(lrn, ops::LrnOp); -#endif - -#endif diff --git a/mobile/src/operators/lrn_op.h b/mobile/src/operators/lrn_op.h deleted file mode 100644 index dde4b968af..0000000000 --- a/mobile/src/operators/lrn_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef LRN_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/lrn_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class LrnOp : public framework::OperatorWithKernel< - DeviceType, LrnParam, - operators::LrnKernel> { - public: - LrnOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::LrnKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/activation.h b/mobile/src/operators/math/activation.h deleted file mode 100644 index d2b465c2bc..0000000000 --- a/mobile/src/operators/math/activation.h +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include "common/enforce.h" -#include "common/types.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#include "operators/math/math.h" -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - -inline ActivationType GetActivationType(const std::string &type) { - if (type == "sigmoid") { - return ActivationType::SIGMOID; - } else if (type == "relu") { - return ActivationType::RELU; - } else if (type == "tanh") { - return ActivationType::TANH; - } else if (type == "identity" || type == "") { - return ActivationType::IDENTITY; - } - PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type."); -} - -inline ActivationType GetActivationType(const int type) { - if (type == 0) { - return ActivationType::IDENTITY; - } else if (type == 1) { - return ActivationType::SIGMOID; - } else if (type == 2) { - return ActivationType::TANH; - } else if (type == 3) { - return ActivationType::RELU; - } - PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type."); -} - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -template -inline float32x4_t vActiveq_f32(const float32x4_t &x) { - return x; -} - -template <> -inline float32x4_t vActiveq_f32(const float32x4_t &x) { - float32x4_t __zero = vdupq_n_f32(0.f); - return vmaxq_f32(x, __zero); -} - -template <> -inline float32x4_t vActiveq_f32(const float32x4_t &x) { - float32x4_t __zero = vdupq_n_f32(0.f); - float32x4_t __six = vdupq_n_f32(6.f); - return vminq_f32(vmaxq_f32(x, __zero), __six); -} - -template <> -inline float32x4_t vActiveq_f32(const float32x4_t &x) { - float32x4_t __one = vdupq_n_f32(1.f); - float32x4_t __x = vnegq_f32(x); - __x = exp_ps(__x); - __x = vaddq_f32(__x, __one); - float32x4_t __out = vrecpeq_f32(__x); - return vmulq_f32(vrecpsq_f32(__x, __out), __out); -} - -template <> -inline float32x4_t vActiveq_f32(const float32x4_t 
&x) { - float32x4_t __one = vdupq_n_f32(1.f); - float32x4_t __x = vnegq_f32(x); - __x = vmulq_n_f32(__x, 2.f); - __x = exp_ps(__x); - __x = vaddq_f32(__x, __one); - float32x4_t __out = vrecpeq_f32(__x); - __out = vmulq_f32(vrecpsq_f32(__x, __out), __out); - __out = vmulq_n_f32(__out, 2.f); - return vsubq_f32(__out, __one); -} - -template <> -inline float32x4_t vActiveq_f32(const float32x4_t &x) { - return log_ps(x); -} - -template -inline float32x4_t vActiveq_f32(const float32x4_t &x, - const float32x4_t &alpha) { - return x; -} - -template <> -inline float32x4_t vActiveq_f32(const float32x4_t &x, - const float32x4_t &alpha) { - return vmaxq_f32(x, vmulq_f32(x, alpha)); -} - -template <> -inline float32x4_t vActiveq_f32(const float32x4_t &x, - const float32x4_t &alpha) { - float32x4_t __zero = vdupq_n_f32(0.f); - float32x4_t __threshold = vdupq_n_f32(vgetq_lane_f32(alpha, 0)); - return vminq_f32(vmaxq_f32(x, __zero), __threshold); -} -#endif - -template -inline float Active(const float &x) { - return x; -} - -template -inline int Active(const int &x) { - return x; -} - -template <> -inline float Active(const float &x) { - return std::max(x, 0.f); -} - -template <> -inline float Active(const float &x) { - return std::min(std::max(x, 0.f), 6.f); -} - -template <> -inline float Active(const float &x) { - // float tmp = x > SIGMOID_THRESHOLD_MAX ? SIGMOID_THRESHOLD_MAX : x; - // tmp = x > SIGMOID_THRESHOLD_MIN ? x : SIGMOID_THRESHOLD_MIN; - // return 1.f / (1.f + exp(-tmp)); - return 1.f / (1.f + exp(-x)); -} - -template <> -inline float Active(const float &x) { - // float tmp = -2.f * x; - // tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; - // return (2.f / (1.f + exp(tmp))) - 1.f; - return 2.f / (1.f + exp(-2.f * x)) - 1.f; -} - -template <> -inline float Active(const float &x) { - return log(x); -} - -template -inline float Active(const float &x, const float &alpha) { - return x; -} - -template <> -inline float Active(const float &x, const float &alpha) { - return std::max(x, alpha * x); -} - -template <> -inline float Active(const float &x, const float &alpha) { - return std::min(std::max(x, 0.f), alpha); -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise/faster_depthwise_conv3x3.h b/mobile/src/operators/math/depthwise/faster_depthwise_conv3x3.h deleted file mode 100644 index 25011b9f01..0000000000 --- a/mobile/src/operators/math/depthwise/faster_depthwise_conv3x3.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { -namespace depthwise { - -void conv_depthwise_3x3p1(const float* din, float* dout, int num, int ch_out, - int h_out, int w_out, int ch_in, int h_in, int w_in, - const float* weights, const float* bias, int stride, - bool flag_bias, bool flag_relu); - -} // namespace depthwise -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise/faster_depthwise_conv3x3p1.cpp b/mobile/src/operators/math/depthwise/faster_depthwise_conv3x3p1.cpp deleted file mode 100644 index 4f3bebd9bf..0000000000 --- a/mobile/src/operators/math/depthwise/faster_depthwise_conv3x3p1.cpp +++ /dev/null @@ -1,2011 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#include -#include "framework/context.h" -#include "operators/math/depthwise/faster_depthwise_conv3x3.h" - -namespace paddle_mobile { -namespace operators { -namespace math { -namespace depthwise { - -void conv_depthwise_3x3s1p1_bias_relu(float *dout, const float *din, - const float *weights, const float *bias, - bool flag_bias, const int num, - const int ch_in, const int h_in, - const int w_in, const int h_out, - const int w_out); - -//! 
for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, const float *din, - const float *weights, const float *bias, - bool flag_bias, const int num, - const int ch_in, const int h_in, - const int w_in, const int h_out, - const int w_out); - -void conv_depthwise_3x3s2p1_bias_relu(float *dout, const float *din, - const float *weights, const float *bias, - bool flag_bias, const int num, - const int ch_in, const int h_in, - const int w_in, const int h_out, - const int w_out); - -void conv_depthwise_3x3p1(const float *din, float *dout, int num, int ch_out, - int h_out, int w_out, int ch_in, int h_in, int w_in, - const float *weights, const float *bias, int stride, - bool flag_bias, bool flag_relu) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias_relu(dout, din, weights, bias, flag_bias, - num, ch_in, h_in, w_in, h_out, w_out); - } else { - conv_depthwise_3x3s1p1_bias_s_relu(dout, din, weights, bias, flag_bias, - num, ch_in, h_in, w_in, h_out, - w_out); - } - } - } else { //! stride = 2 - if (flag_relu) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias_relu(dout, din, weights, bias, flag_bias, - num, ch_in, h_in, w_in, h_out, w_out); - } - } - } -} - -// 4line -void conv_depthwise_3x3s1p1_bias_relu(float *dout, const float *din, - const float *weights, const float *bias, - bool flag_bias, const int num, - const int ch_in, const int h_in, - const int w_in, const int h_out, - const int w_out) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int tile_h = (h_in + 3) >> 2; - int cnt_col = tile_w - 2; - float *zero_ptr = static_cast( - framework::CPUContext::Context()->get_work_space(w_in * sizeof(float))); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - int size_pad_bottom = (unsigned int)(1 + (tile_h << 2) - h_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? 
bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - 
"fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // 
r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* 
v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext 
v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b 
\n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 
* - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, 
v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), [w1] "w"(wr1), [w2] "w"(wr2), - [bias_val] "r"(vbias), [vmask] "r"(vmask), [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float *din_channel = din_batch + i * size_in_channel; - - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float *dout_channel = dout_batch + i * size_out_channel; - - const float *dr0 = din_channel; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - const float *din0_ptr = nullptr; - const float *din1_ptr = nullptr; - const float *din2_ptr = nullptr; - const float *din3_ptr = nullptr; - - float *doutr0 = nullptr; - float *doutr1 = nullptr; - - float *ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), [rmask] "+r"(rmask_ptr), [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), - [bias_val] "r"(bias_val), [vzero] "w"(vzero) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias_relu(float *dout, const float *din, - const float *weights, const float *bias, - bool flag_bias, const int num, - const int ch_in, const int h_in, - const int w_in, const int h_out, - const int w_out) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float *zero_ptr = static_cast( - framework::CPUContext::Context()->get_work_space(w_in * sizeof(float))); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float *din_channel = din_batch + i * size_in_channel; - float *dout_channel = dout_batch + i * size_out_channel; - - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = 
vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float *dr0 = din_channel; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - - const float *din0_ptr = dr0; - const float *din1_ptr = dr1; - const float *din2_ptr = dr2; - const float *din3_ptr = dr3; - const float *din4_ptr = dr4; - - float *doutr0 = dout_channel; - float *doutr0_ptr = nullptr; - float *doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd 
v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, 
%[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, 
%[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, 
%[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), [cnt] "+r"(cnt) - : [vzero] "w"(vzero), [w0] "w"(wr0), [w1] "w"(wr1), [w2] "w"(wr2), - [remain] "r"(cnt_remain), [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), [wmask] "w"(wmask), [vbias] "w"(wbias) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", - "v17", "v18", "v19", "v20", "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - - unsigned int *mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), [wr0] "w"(wr0), [wr1] "w"(wr1), - [wr2] "w"(wr2), [bias] "r"(bias_c) - : "cc", "memory", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, const float *din, - const float *weights, const float *bias, - bool flag_bias, const int num, - const int ch_in, const int h_in, - const int w_in, const int h_out, - const int w_out) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float *doutr0 = dout_channel; - float 
*doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float *dr0 = din_channel + hs * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla 
v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "fmax v12.4s, v12.4s, %[zero].4s\n" // out1 -> relu - "fmax v13.4s, v13.4s, %[zero].4s\n" // out2 -> relu - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), [din1] "+r"(dr1), [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), [zero] "w"(vzero), - [mask] "w"(vmask_rp), [bias] "w"(wbias), [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, 
%q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmax.f32 q14, q14, %q[zero]\n" // out1 -> relu - "vmax.f32 q15, q15, %q[zero]\n" // out2 -> relu - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), [din1] "+r"(dr1), [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), [zero] "w"(vzero), - [mask] "w"(vmask_rp), [bias] "w"(wbias), [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", - "q13", "q14", "q15"); -#endif //__aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - }; - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -} // namespace depthwise -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/depthwise_conv3x3.cpp b/mobile/src/operators/math/depthwise_conv3x3.cpp deleted file mode 100644 index 4f8b7a7b30..0000000000 --- a/mobile/src/operators/math/depthwise_conv3x3.cpp +++ /dev/null @@ -1,1062 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#include "operators/math/depthwise_conv3x3.h" -#include - -namespace paddle_mobile { -namespace operators { -namespace math { - -#ifndef __aarch64__ -inline float32x4_t vpaddq_f32(float32x4_t r0, float32x4_t r1) { - float32x2_t sum0 = vpadd_f32(vget_low_f32(r0), vget_high_f32(r0)); - float32x2_t sum1 = vpadd_f32(vget_low_f32(r1), vget_high_f32(r1)); - return vcombine_f32(sum0, sum1); -} -#endif - -template -inline void Depth3x3NormalRowLoadInput(const float *input, float32x4_t *y) { - y[0] = vld1q_f32(input); - y[2] = vld1q_f32(input + 4); - y[1] = vextq_f32(y[0], y[2], 1); - y[2] = vextq_f32(y[0], y[2], 2); -} - -template <> -inline void Depth3x3NormalRowLoadInput<2>(const float *input, float32x4_t *y) { - float32x4x2_t x = vld2q_f32(input); - y[0] = x.val[0]; - y[1] = x.val[1]; - y[2] = vextq_f32(y[0], y[0], 1); - y[2] = vsetq_lane_f32(input[8], y[2], 3); -} - -#define DEPTHWISE_CONV3X3_NORMAL_BORDER(start, end) \ - for (int w = start; w < end; ++w) { \ - const int w_in_start = -padding_w + w * Stride_w; \ - const int w_in_end = w_in_start + 3; \ - const int w_start = w_in_start > 0 ? w_in_start : 0; \ - const int w_end = w_in_end < input_w ? 
w_in_end : input_w; \ - float value = 0; \ - for (int h_in = h_start; h_in < h_end; ++h_in) { \ - for (int w_in = w_start; w_in < w_end; ++w_in) { \ - value += filter[(h_in - h_in_start) * 3 + (w_in - w_in_start)] * \ - input[h_in * input_w + w_in]; \ - } \ - } \ - output_ptr[w] = value; \ - } - -template -inline void DepthwiseConv3x3NormalRow(const float *input, const float *filter, - const int h_output, const int input_h, - const int input_w, const int padding_h, - const int padding_w, const int output_w, - float *output, float32x4_t *ker) { - const int h_in_start = -padding_h + h_output * Stride_h; - const int h_in_end = h_in_start + 3; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int h_end = h_in_end < input_h ? h_in_end : input_h; - - int valid_w_start = (padding_w + Stride_w - 1) / Stride_w; - int valid_w_end = (input_w + padding_w - 3) / Stride_w + 1; - if (valid_w_end < valid_w_start) { - valid_w_end = valid_w_start; - } - // const int valid_w_end = output_w - valid_w_start; - float *output_ptr = output + h_output * output_w; - // border left - DEPTHWISE_CONV3X3_NORMAL_BORDER(0, valid_w_start) - // middle - int output_tiles = (valid_w_end - valid_w_start) >> 2; - float32x4_t _sum, _x[3]; - // valid w - for (int w = 0; w < output_tiles * 4; w += 4) { - _sum = vdupq_n_f32(0.f); - int output_offset = valid_w_start + w; - int input_w_offset = output_offset * Stride_w - padding_w; - for (int h_in = h_start; h_in < h_end; ++h_in) { - int index = h_in - h_in_start; - Depth3x3NormalRowLoadInput( - input + h_in * input_w + input_w_offset, _x); - _sum = vmlaq_lane_f32(_sum, _x[0], vget_low_f32(ker[index]), 0); - _sum = vmlaq_lane_f32(_sum, _x[1], vget_low_f32(ker[index]), 1); - _sum = vmlaq_lane_f32(_sum, _x[2], vget_high_f32(ker[index]), 0); - } - vst1q_f32(output_ptr + output_offset, _sum); - } - // remain valid w - int remain = (valid_w_end - valid_w_start) & 0x3; - if (remain > 0) { - _sum = vdupq_n_f32(0.f); - int remain_start = valid_w_start + 
(output_tiles << 2); - int input_w_offset = remain_start * Stride_w - padding_w; - float *output_ptr0 = output_ptr + remain_start; - - for (int h_in = h_start; h_in < h_end; ++h_in) { - int index = h_in - h_in_start; - Depth3x3NormalRowLoadInput( - input + h_in * input_w + input_w_offset, _x); - _sum = vmlaq_lane_f32(_sum, _x[0], vget_low_f32(ker[index]), 0); - _sum = vmlaq_lane_f32(_sum, _x[1], vget_low_f32(ker[index]), 1); - _sum = vmlaq_lane_f32(_sum, _x[2], vget_high_f32(ker[index]), 0); - } - switch (remain) { - case 3: - vst1q_lane_f32(output_ptr0 + 2, _sum, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_sum)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _sum, 0); - break; - } - } - // border right - DEPTHWISE_CONV3X3_NORMAL_BORDER(valid_w_end, output_w) -} - -template <> -void DepthwiseConv3x3S1(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - const float *input_data = input.data(); - const float *filter_data = filter.data(); - float *out_data = output->mutable_data(); - - const int input_h = input.dims()[2]; - const int input_w = input.dims()[3]; - const int output_h = output->dims()[2]; - const int output_w = output->dims()[3]; - const int padding_h = paddings[0]; - const int padding_w = paddings[1]; - const int image_size = input_h * input_w; - const int out_image_size = output_h * output_w; - const int valid_h_start = padding_h; - const int valid_h_end = output_h - valid_h_start; - const int valid_h = - valid_h_end - valid_h_start > 0 ? 
valid_h_end - valid_h_start : 0; - const int valid_w_start = padding_w; - const int valid_w_end = output_w - valid_w_start; - const int valid_w = valid_w_end - valid_w_start; - - #pragma omp parallel for - for (int g = 0; g < input.dims()[1]; ++g) { - const float *input_ptr = input_data + g * image_size; - const float *filter_ptr = filter_data + g * 9; - float *output_ptr = out_data + g * out_image_size; - - const float *filter_ptr0 = filter_ptr; - const float *filter_ptr1 = filter_ptr0 + 3; - const float *filter_ptr2 = filter_ptr1 + 3; - float32x4_t _ker[3]; - _ker[0] = vld1q_f32(filter_ptr0); - _ker[1] = vld1q_f32(filter_ptr1); - _ker[2] = vld1q_f32(filter_ptr2); - - // pad top - for (int h = 0; h < valid_h_start; ++h) { - DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - - // output 2x6 - int output_w_tiles = valid_w / 6; - int output_w_remain = valid_w - output_w_tiles * 6; - for (int h = valid_h_start; h < valid_h_end - 1; h += 2) { - const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - float *output_ptr1 = output_ptr0 + output_w; - // pad left - if (padding_w) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t row3 = vld1q_f32(input_ptr3); - float32x4_t zero = vdupq_n_f32(0.f); - row0 = vextq_f32(zero, row0, 3); - row1 = vextq_f32(zero, row1, 3); - row2 = vextq_f32(zero, row2, 3); - row3 = vextq_f32(zero, row3, 3); - float32x4_t acc0, acc1; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 3) { - output_ptr0[w] = 0.f; - output_ptr1[w] = 0.f; - } else { - acc0 = vmulq_f32(row0, _ker[0]); - acc0 = vmlaq_f32(acc0, row1, 
_ker[1]); - acc0 = vmlaq_f32(acc0, row2, _ker[2]); - acc0 = vextq_f32(acc0, acc0, 1); - acc1 = vmulq_f32(row1, _ker[0]); - acc1 = vmlaq_f32(acc1, row2, _ker[1]); - acc1 = vmlaq_f32(acc1, row3, _ker[2]); - acc1 = vextq_f32(acc1, acc1, 1); - float32x2_t sum = vpadd_f32(vget_low_f32(acc0), vget_low_f32(acc1)); - vst1_lane_f32(output_ptr0 + w, sum, 0); - vst1_lane_f32(output_ptr1 + w, sum, 1); - - row0 = vextq_f32(zero, row0, 3); - row1 = vextq_f32(zero, row1, 3); - row2 = vextq_f32(zero, row2, 3); - row3 = vextq_f32(zero, row3, 3); - } - } - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - } - // valid - float32x4_t _result0, _result1, _result2, _result3; - for (int loop = 0; loop < output_w_tiles; ++loop) { - float32x4_t _row00 = vld1q_f32(input_ptr0); - float32x4_t _row01 = vld1q_f32(input_ptr0 + 4); - float32x4_t _row10 = vld1q_f32(input_ptr1); - float32x4_t _row11 = vld1q_f32(input_ptr1 + 4); - - float32x4_t _ext01 = vextq_f32(_row00, _row01, 1); - float32x4_t _ext02 = vextq_f32(_row00, _row01, 2); - float32x4_t _ext03 = vextq_f32(_row01, _row01, 1); - float32x4_t _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = vmulq_lane_f32(_row00, vget_low_f32(_ker[0]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[0]), 0); - _result1 = vmulq_lane_f32(_row01, vget_low_f32(_ker[0]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[0]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[0]), 0); - - _ext01 = vextq_f32(_row10, _row11, 1); - _ext02 = vextq_f32(_row10, _row11, 2); - _ext03 = vextq_f32(_row11, _row11, 1); - _ext04 = vextq_f32(_row11, _row11, 2); - - _result0 = vmlaq_lane_f32(_result0, _row10, vget_low_f32(_ker[1]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[1]), 0); - _result1 = vmlaq_lane_f32(_result1, _row11, 
vget_low_f32(_ker[1]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[1]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[1]), 0); - - _result2 = vmulq_lane_f32(_row10, vget_low_f32(_ker[0]), 0); - _result2 = vmlaq_lane_f32(_result2, _ext01, vget_low_f32(_ker[0]), 1); - _result2 = vmlaq_lane_f32(_result2, _ext02, vget_high_f32(_ker[0]), 0); - _result3 = vmulq_lane_f32(_row11, vget_low_f32(_ker[0]), 0); - _result3 = vmlaq_lane_f32(_result3, _ext03, vget_low_f32(_ker[0]), 1); - _result3 = vmlaq_lane_f32(_result3, _ext04, vget_high_f32(_ker[0]), 0); - - _row00 = vld1q_f32(input_ptr2); - _row01 = vld1q_f32(input_ptr2 + 4); - _row10 = vld1q_f32(input_ptr3); - _row11 = vld1q_f32(input_ptr3 + 4); - - _ext01 = vextq_f32(_row00, _row01, 1); - _ext02 = vextq_f32(_row00, _row01, 2); - _ext03 = vextq_f32(_row01, _row01, 1); - _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = vmlaq_lane_f32(_result0, _row00, vget_low_f32(_ker[2]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _row01, vget_low_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[2]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[2]), 0); - - _result2 = vmlaq_lane_f32(_result2, _row00, vget_low_f32(_ker[1]), 0); - _result2 = vmlaq_lane_f32(_result2, _ext01, vget_low_f32(_ker[1]), 1); - _result2 = vmlaq_lane_f32(_result2, _ext02, vget_high_f32(_ker[1]), 0); - _result3 = vmlaq_lane_f32(_result3, _row01, vget_low_f32(_ker[1]), 0); - _result3 = vmlaq_lane_f32(_result3, _ext03, vget_low_f32(_ker[1]), 1); - _result3 = vmlaq_lane_f32(_result3, _ext04, vget_high_f32(_ker[1]), 0); - - _ext01 = vextq_f32(_row10, _row11, 1); - _ext02 = vextq_f32(_row10, _row11, 2); - _ext03 = vextq_f32(_row11, _row11, 1); - _ext04 = vextq_f32(_row11, _row11, 2); - - _result2 = vmlaq_lane_f32(_result2, 
_row10, vget_low_f32(_ker[2]), 0); - _result2 = vmlaq_lane_f32(_result2, _ext01, vget_low_f32(_ker[2]), 1); - _result2 = vmlaq_lane_f32(_result2, _ext02, vget_high_f32(_ker[2]), 0); - _result3 = vmlaq_lane_f32(_result3, _row11, vget_low_f32(_ker[2]), 0); - _result3 = vmlaq_lane_f32(_result3, _ext03, vget_low_f32(_ker[2]), 1); - _result3 = vmlaq_lane_f32(_result3, _ext04, vget_high_f32(_ker[2]), 0); - - vst1q_f32(output_ptr0, _result0); - vst1_f32(output_ptr0 + 4, vget_low_f32(_result1)); - vst1q_f32(output_ptr1, _result2); - vst1_f32(output_ptr1 + 4, vget_low_f32(_result3)); - - input_ptr0 += 6; - input_ptr1 += 6; - input_ptr2 += 6; - input_ptr3 += 6; - output_ptr0 += 6; - output_ptr1 += 6; - } - // remain w - if (output_w_remain > 0) { - float32x4_t _row00 = vld1q_f32(input_ptr0); - float32x4_t _row01 = vld1q_f32(input_ptr0 + 4); - float32x4_t _row10 = vld1q_f32(input_ptr1); - float32x4_t _row11 = vld1q_f32(input_ptr1 + 4); - - float32x4_t _ext01 = vextq_f32(_row00, _row01, 1); - float32x4_t _ext02 = vextq_f32(_row00, _row01, 2); - float32x4_t _ext03 = vextq_f32(_row01, _row01, 1); - float32x4_t _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = vmulq_lane_f32(_row00, vget_low_f32(_ker[0]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[0]), 0); - _result1 = vmulq_lane_f32(_row01, vget_low_f32(_ker[0]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[0]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[0]), 0); - - _ext01 = vextq_f32(_row10, _row11, 1); - _ext02 = vextq_f32(_row10, _row11, 2); - _ext03 = vextq_f32(_row11, _row11, 1); - _ext04 = vextq_f32(_row11, _row11, 2); - - _result0 = vmlaq_lane_f32(_result0, _row10, vget_low_f32(_ker[1]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[1]), 0); - _result1 = 
vmlaq_lane_f32(_result1, _row11, vget_low_f32(_ker[1]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[1]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[1]), 0); - - _result2 = vmulq_lane_f32(_row10, vget_low_f32(_ker[0]), 0); - _result2 = vmlaq_lane_f32(_result2, _ext01, vget_low_f32(_ker[0]), 1); - _result2 = vmlaq_lane_f32(_result2, _ext02, vget_high_f32(_ker[0]), 0); - _result3 = vmulq_lane_f32(_row11, vget_low_f32(_ker[0]), 0); - _result3 = vmlaq_lane_f32(_result3, _ext03, vget_low_f32(_ker[0]), 1); - _result3 = vmlaq_lane_f32(_result3, _ext04, vget_high_f32(_ker[0]), 0); - - _row00 = vld1q_f32(input_ptr2); - _row01 = vld1q_f32(input_ptr2 + 4); - _row10 = vld1q_f32(input_ptr3); - _row11 = vld1q_f32(input_ptr3 + 4); - - _ext01 = vextq_f32(_row00, _row01, 1); - _ext02 = vextq_f32(_row00, _row01, 2); - _ext03 = vextq_f32(_row01, _row01, 1); - _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = vmlaq_lane_f32(_result0, _row00, vget_low_f32(_ker[2]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _row01, vget_low_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[2]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[2]), 0); - - _result2 = vmlaq_lane_f32(_result2, _row00, vget_low_f32(_ker[1]), 0); - _result2 = vmlaq_lane_f32(_result2, _ext01, vget_low_f32(_ker[1]), 1); - _result2 = vmlaq_lane_f32(_result2, _ext02, vget_high_f32(_ker[1]), 0); - _result3 = vmlaq_lane_f32(_result3, _row01, vget_low_f32(_ker[1]), 0); - _result3 = vmlaq_lane_f32(_result3, _ext03, vget_low_f32(_ker[1]), 1); - _result3 = vmlaq_lane_f32(_result3, _ext04, vget_high_f32(_ker[1]), 0); - - _ext01 = vextq_f32(_row10, _row11, 1); - _ext02 = vextq_f32(_row10, _row11, 2); - _ext03 = vextq_f32(_row11, _row11, 1); - _ext04 = vextq_f32(_row11, _row11, 2); - - 
_result2 = vmlaq_lane_f32(_result2, _row10, vget_low_f32(_ker[2]), 0); - _result2 = vmlaq_lane_f32(_result2, _ext01, vget_low_f32(_ker[2]), 1); - _result2 = vmlaq_lane_f32(_result2, _ext02, vget_high_f32(_ker[2]), 0); - _result3 = vmlaq_lane_f32(_result3, _row11, vget_low_f32(_ker[2]), 0); - _result3 = vmlaq_lane_f32(_result3, _ext03, vget_low_f32(_ker[2]), 1); - _result3 = vmlaq_lane_f32(_result3, _ext04, vget_high_f32(_ker[2]), 0); - - switch (output_w_remain) { - case 5: - vst1q_lane_f32(output_ptr0 + 4, _result1, 0); - vst1q_lane_f32(output_ptr1 + 4, _result3, 0); - case 4: - vst1q_f32(output_ptr0, _result0); - vst1q_f32(output_ptr1, _result2); - break; - case 3: - vst1q_lane_f32(output_ptr0 + 2, _result0, 2); - vst1q_lane_f32(output_ptr1 + 2, _result2, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_result0)); - vst1_f32(output_ptr1, vget_low_f32(_result2)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _result0, 0); - vst1q_lane_f32(output_ptr1, _result2, 0); - break; - } - - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - input_ptr2 += output_w_remain; - input_ptr3 += output_w_remain; - output_ptr0 += output_w_remain; - output_ptr1 += output_w_remain; - } - // pad right - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t row3 = vld1_f32(input_ptr3); - float32x2_t zero = vdup_n_f32(0.f); - float32x2_t acc0, acc1; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0.f; - *output_ptr1 = 0.f; - } else { - acc0 = vmul_f32(row0, vget_low_f32(_ker[0])); - acc0 = vmla_f32(acc0, row1, vget_low_f32(_ker[1])); - acc0 = vmla_f32(acc0, row2, vget_low_f32(_ker[2])); - acc1 = vmul_f32(row1, vget_low_f32(_ker[0])); - acc1 = vmla_f32(acc1, row2, vget_low_f32(_ker[1])); - acc1 = vmla_f32(acc1, row3, vget_low_f32(_ker[2])); - float32x2_t sum = vpadd_f32(acc0, 
acc1); - vst1_lane_f32(output_ptr0, sum, 0); - vst1_lane_f32(output_ptr1, sum, 1); - row0 = vext_f32(row0, zero, 1); - row1 = vext_f32(row1, zero, 1); - row2 = vext_f32(row2, zero, 1); - row3 = vext_f32(row3, zero, 1); - } - output_ptr0++; - output_ptr1++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xfffffffe); - if (start_h < valid_h_end) { - const float *input_ptr0 = input_ptr + (start_h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - float *output_ptr0 = output_ptr + start_h * output_w; - // pad left - if (padding_w) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t zero = vdupq_n_f32(0.f); - row0 = vextq_f32(zero, row0, 3); - row1 = vextq_f32(zero, row1, 3); - row2 = vextq_f32(zero, row2, 3); - float32x4_t acc; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 3) { - output_ptr0[w] = 0.f; - } else { - acc = vmulq_f32(row0, _ker[0]); - acc = vmlaq_f32(acc, row1, _ker[1]); - acc = vmlaq_f32(acc, row2, _ker[2]); - acc = vextq_f32(acc, acc, 1); - float32x2_t sum = vpadd_f32(vget_low_f32(acc), vget_low_f32(acc)); - vst1_lane_f32(output_ptr0 + w, sum, 0); - - row0 = vextq_f32(zero, row0, 3); - row1 = vextq_f32(zero, row1, 3); - row2 = vextq_f32(zero, row2, 3); - } - } - output_ptr0 += valid_w_start; - } - // valid - float32x4_t _result0, _result1; - for (int loop = 0; loop < output_w_tiles; ++loop) { - float32x4_t _row00 = vld1q_f32(input_ptr0); - float32x4_t _row01 = vld1q_f32(input_ptr0 + 4); - float32x4_t _row10 = vld1q_f32(input_ptr1); - float32x4_t _row11 = vld1q_f32(input_ptr1 + 4); - - float32x4_t _ext01 = vextq_f32(_row00, _row01, 1); - float32x4_t _ext02 = vextq_f32(_row00, _row01, 2); - float32x4_t _ext03 = vextq_f32(_row01, _row01, 1); - float32x4_t _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = 
vmulq_lane_f32(_row00, vget_low_f32(_ker[0]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[0]), 0); - _result1 = vmulq_lane_f32(_row01, vget_low_f32(_ker[0]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[0]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[0]), 0); - - _ext01 = vextq_f32(_row10, _row11, 1); - _ext02 = vextq_f32(_row10, _row11, 2); - _ext03 = vextq_f32(_row11, _row11, 1); - _ext04 = vextq_f32(_row11, _row11, 2); - - _result0 = vmlaq_lane_f32(_result0, _row10, vget_low_f32(_ker[1]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[1]), 0); - _result1 = vmlaq_lane_f32(_result1, _row11, vget_low_f32(_ker[1]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[1]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[1]), 0); - - _row00 = vld1q_f32(input_ptr2); - _row01 = vld1q_f32(input_ptr2 + 4); - - _ext01 = vextq_f32(_row00, _row01, 1); - _ext02 = vextq_f32(_row00, _row01, 2); - _ext03 = vextq_f32(_row01, _row01, 1); - _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = vmlaq_lane_f32(_result0, _row00, vget_low_f32(_ker[2]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _row01, vget_low_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[2]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[2]), 0); - - vst1q_f32(output_ptr0, _result0); - vst1_f32(output_ptr0 + 4, vget_low_f32(_result1)); - - input_ptr0 += 6; - input_ptr1 += 6; - input_ptr2 += 6; - output_ptr0 += 6; - } - - if (output_w_remain > 0) { - float32x4_t _row00 = vld1q_f32(input_ptr0); - float32x4_t _row01 = vld1q_f32(input_ptr0 + 4); - 
float32x4_t _row10 = vld1q_f32(input_ptr1); - float32x4_t _row11 = vld1q_f32(input_ptr1 + 4); - - float32x4_t _ext01 = vextq_f32(_row00, _row01, 1); - float32x4_t _ext02 = vextq_f32(_row00, _row01, 2); - float32x4_t _ext03 = vextq_f32(_row01, _row01, 1); - float32x4_t _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = vmulq_lane_f32(_row00, vget_low_f32(_ker[0]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[0]), 0); - _result1 = vmulq_lane_f32(_row01, vget_low_f32(_ker[0]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[0]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[0]), 0); - - _ext01 = vextq_f32(_row10, _row11, 1); - _ext02 = vextq_f32(_row10, _row11, 2); - _ext03 = vextq_f32(_row11, _row11, 1); - _ext04 = vextq_f32(_row11, _row11, 2); - - _result0 = vmlaq_lane_f32(_result0, _row10, vget_low_f32(_ker[1]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[1]), 0); - _result1 = vmlaq_lane_f32(_result1, _row11, vget_low_f32(_ker[1]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[1]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, vget_high_f32(_ker[1]), 0); - - _row00 = vld1q_f32(input_ptr2); - _row01 = vld1q_f32(input_ptr2 + 4); - - _ext01 = vextq_f32(_row00, _row01, 1); - _ext02 = vextq_f32(_row00, _row01, 2); - _ext03 = vextq_f32(_row01, _row01, 1); - _ext04 = vextq_f32(_row01, _row01, 2); - - _result0 = vmlaq_lane_f32(_result0, _row00, vget_low_f32(_ker[2]), 0); - _result0 = vmlaq_lane_f32(_result0, _ext01, vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext02, vget_high_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _row01, vget_low_f32(_ker[2]), 0); - _result1 = vmlaq_lane_f32(_result1, _ext03, vget_low_f32(_ker[2]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext04, 
vget_high_f32(_ker[2]), 0); - - switch (output_w_remain) { - case 5: - vst1q_lane_f32(output_ptr0 + 4, _result1, 0); - case 4: - vst1q_f32(output_ptr0, _result0); - break; - case 3: - vst1q_lane_f32(output_ptr0 + 2, _result0, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_result0)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _result0, 0); - break; - } - - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - input_ptr2 += output_w_remain; - output_ptr0 += output_w_remain; - } - // pad right - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t zero = vdup_n_f32(0.f); - float32x2_t acc; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0.f; - } else { - acc = vmul_f32(row0, vget_low_f32(_ker[0])); - acc = vmla_f32(acc, row1, vget_low_f32(_ker[1])); - acc = vmla_f32(acc, row2, vget_low_f32(_ker[2])); - float32x2_t sum = vpadd_f32(acc, acc); - vst1_lane_f32(output_ptr0, sum, 0); - row0 = vext_f32(row0, zero, 1); - row1 = vext_f32(row1, zero, 1); - row2 = vext_f32(row2, zero, 1); - } - output_ptr0++; - } - } - } - // pad bottom - for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { - DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - } -} - -template <> -void DepthwiseConv3x3S2(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - const float *input_data = input.data(); - const float *filter_data = filter.data(); - float *out_data = output->mutable_data(); - - const int input_h = input.dims()[2]; - const int input_w = input.dims()[3]; - const int output_h = output->dims()[2]; - const int output_w = output->dims()[3]; - const int padding_h = paddings[0]; - const int padding_w 
= paddings[1]; - const int image_size = input_h * input_w; - const int out_image_size = output_h * output_w; - const int valid_h_start = (padding_h + 1) / 2; - const int valid_h_end = - std::max((input_h + padding_h - 1) / 2, valid_h_start); - const int valid_h = - valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; - const int valid_w_start = (padding_w + 1) / 2; - const int valid_w_end = - std::max((input_w + padding_w - 1) / 2, valid_w_start); - const int valid_w = valid_w_end - valid_w_start; - const int input_w_start = 2 * valid_w_start - padding_w; - - #pragma omp parallel for - for (int g = 0; g < input.dims()[1]; ++g) { - const float *input_ptr = input_data + g * image_size; - const float *filter_ptr = filter_data + g * 9; - float *output_ptr = out_data + g * out_image_size; - - const float *filter_ptr0 = filter_ptr; - const float *filter_ptr1 = filter_ptr0 + 3; - const float *filter_ptr2 = filter_ptr1 + 3; - float32x4_t _ker[3]; - _ker[0] = vld1q_f32(filter_ptr0); - _ker[1] = vld1q_f32(filter_ptr1); - _ker[2] = vld1q_f32(filter_ptr2); - - // pad top - for (int h = 0; h < valid_h_start; ++h) { - DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - // valid 2x4 - int output_w_tiles = valid_w / 4; - int output_w_remain = valid_w - output_w_tiles * 4; - for (int h = valid_h_start; h < valid_h_end - 1; h += 2) { - const float *input_ptr0 = input_ptr + (2 * h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - const float *input_ptr4 = input_ptr3 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - float *output_ptr1 = output_ptr0 + output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - (w << 1); - if (padding >= 3) { - output_ptr0[w] = 0; - output_ptr1[w] = 0; - } else 
{ - float32x4_t row0 = vld1q_f32(input_ptr0 - padding); - float32x4_t row1 = vld1q_f32(input_ptr1 - padding); - float32x4_t row2 = vld1q_f32(input_ptr2 - padding); - float32x4_t row3 = vld1q_f32(input_ptr3 - padding); - float32x4_t row4 = vld1q_f32(input_ptr4 - padding); - float32x4_t acc0 = vmulq_f32(row0, _ker[0]); - float32x4_t acc1 = vmulq_f32(row2, _ker[0]); - acc0 = vmlaq_f32(acc0, row1, _ker[1]); - acc1 = vmlaq_f32(acc1, row3, _ker[1]); - acc0 = vmlaq_f32(acc0, row2, _ker[2]); - acc1 = vmlaq_f32(acc1, row4, _ker[2]); - float sum0 = vgetq_lane_f32(acc0, 2); - float sum1 = vgetq_lane_f32(acc1, 2); - if (padding == 1) { - sum0 += vgetq_lane_f32(acc0, 1); - sum1 += vgetq_lane_f32(acc1, 1); - } - output_ptr0[w] = sum0; - output_ptr1[w] = sum1; - } - } - input_ptr0 += input_w_start; - input_ptr1 += input_w_start; - input_ptr2 += input_w_start; - input_ptr3 += input_w_start; - input_ptr4 += input_w_start; - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - } - // valid - float32x4_t _result0, _result1, _ext; - for (int loop = 0; loop < output_w_tiles; ++loop) { - float32x4x2_t _row0 = vld2q_f32(input_ptr0); - float32x4x2_t _row1 = vld2q_f32(input_ptr1); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr0[8], _ext, 3); - _result0 = vmulq_lane_f32(_row0.val[0], vget_low_f32(_ker[0]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row0.val[1], vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[0]), 0); - - _ext = vextq_f32(_row1.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr1[8], _ext, 3); - _result0 = - vmlaq_lane_f32(_result0, _row1.val[0], vget_low_f32(_ker[1]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row1.val[1], vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[1]), 0); - - _row0 = vld2q_f32(input_ptr2); - _row1 = vld2q_f32(input_ptr3); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr2[8], _ext, 3); - 
_result0 = - vmlaq_lane_f32(_result0, _row0.val[0], vget_low_f32(_ker[2]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row0.val[1], vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[2]), 0); - _result1 = vmulq_lane_f32(_row0.val[0], vget_low_f32(_ker[0]), 0); - _result1 = - vmlaq_lane_f32(_result1, _row0.val[1], vget_low_f32(_ker[0]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext, vget_high_f32(_ker[0]), 0); - - _ext = vextq_f32(_row1.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr3[8], _ext, 3); - _result1 = - vmlaq_lane_f32(_result1, _row1.val[0], vget_low_f32(_ker[1]), 0); - _result1 = - vmlaq_lane_f32(_result1, _row1.val[1], vget_low_f32(_ker[1]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext, vget_high_f32(_ker[1]), 0); - - _row0 = vld2q_f32(input_ptr4); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr4[8], _ext, 3); - _result1 = - vmlaq_lane_f32(_result1, _row0.val[0], vget_low_f32(_ker[2]), 0); - _result1 = - vmlaq_lane_f32(_result1, _row0.val[1], vget_low_f32(_ker[2]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext, vget_high_f32(_ker[2]), 0); - - vst1q_f32(output_ptr0, _result0); - vst1q_f32(output_ptr1, _result1); - - input_ptr0 += 8; - input_ptr1 += 8; - input_ptr2 += 8; - input_ptr3 += 8; - input_ptr4 += 8; - output_ptr0 += 4; - output_ptr1 += 4; - } - // remain w - if (output_w_remain > 0) { - float32x4x2_t _row0 = vld2q_f32(input_ptr0); - float32x4x2_t _row1 = vld2q_f32(input_ptr1); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr0[8], _ext, 3); - _result0 = vmulq_lane_f32(_row0.val[0], vget_low_f32(_ker[0]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row0.val[1], vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[0]), 0); - - _ext = vextq_f32(_row1.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr1[8], _ext, 3); - _result0 = - vmlaq_lane_f32(_result0, _row1.val[0], vget_low_f32(_ker[1]), 0); - _result0 
= - vmlaq_lane_f32(_result0, _row1.val[1], vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[1]), 0); - - _row0 = vld2q_f32(input_ptr2); - _row1 = vld2q_f32(input_ptr3); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr2[8], _ext, 3); - _result0 = - vmlaq_lane_f32(_result0, _row0.val[0], vget_low_f32(_ker[2]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row0.val[1], vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[2]), 0); - _result1 = vmulq_lane_f32(_row0.val[0], vget_low_f32(_ker[0]), 0); - _result1 = - vmlaq_lane_f32(_result1, _row0.val[1], vget_low_f32(_ker[0]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext, vget_high_f32(_ker[0]), 0); - - _ext = vextq_f32(_row1.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr3[8], _ext, 3); - _result1 = - vmlaq_lane_f32(_result1, _row1.val[0], vget_low_f32(_ker[1]), 0); - _result1 = - vmlaq_lane_f32(_result1, _row1.val[1], vget_low_f32(_ker[1]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext, vget_high_f32(_ker[1]), 0); - - _row0 = vld2q_f32(input_ptr4); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr4[8], _ext, 3); - _result1 = - vmlaq_lane_f32(_result1, _row0.val[0], vget_low_f32(_ker[2]), 0); - _result1 = - vmlaq_lane_f32(_result1, _row0.val[1], vget_low_f32(_ker[2]), 1); - _result1 = vmlaq_lane_f32(_result1, _ext, vget_high_f32(_ker[2]), 0); - - switch (output_w_remain) { - case 3: - vst1q_lane_f32(output_ptr0 + 2, _result0, 2); - vst1q_lane_f32(output_ptr1 + 2, _result1, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_result0)); - vst1_f32(output_ptr1, vget_low_f32(_result1)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _result0, 0); - vst1q_lane_f32(output_ptr1, _result1, 0); - break; - } - input_ptr0 += output_w_remain * 2; - input_ptr1 += output_w_remain * 2; - input_ptr2 += output_w_remain * 2; - input_ptr3 += output_w_remain * 2; - input_ptr4 += output_w_remain * 
2; - output_ptr0 += output_w_remain; - output_ptr1 += output_w_remain; - } - // pad right - if (padding_w > 0) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t row3 = vld1q_f32(input_ptr3); - float32x4_t row4 = vld1q_f32(input_ptr4); - float32x4_t acc0, acc1; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0; - *output_ptr1 = 0; - } else { - acc0 = vmulq_f32(row0, _ker[0]); - acc1 = vmulq_f32(row2, _ker[0]); - acc0 = vmlaq_f32(acc0, row1, _ker[1]); - acc1 = vmlaq_f32(acc1, row3, _ker[1]); - acc0 = vmlaq_f32(acc0, row2, _ker[2]); - acc1 = vmlaq_f32(acc1, row4, _ker[2]); - float sum0 = vgetq_lane_f32(acc0, 0); - float sum1 = vgetq_lane_f32(acc1, 0); - if (padding == 1) { - sum0 += vgetq_lane_f32(acc0, 1); - sum1 += vgetq_lane_f32(acc1, 1); - } - *output_ptr0 = sum0; - *output_ptr1 = sum1; - } - output_ptr0++; - output_ptr1++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xfffffffe); - if (start_h < valid_h_end) { - const float *input_ptr0 = input_ptr + (2 * start_h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - float *output_ptr0 = output_ptr + start_h * output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - (w << 1); - if (padding >= 3) { - output_ptr0[w] = 0; - } else { - float32x4_t row0 = vld1q_f32(input_ptr0 - padding); - float32x4_t row1 = vld1q_f32(input_ptr1 - padding); - float32x4_t row2 = vld1q_f32(input_ptr2 - padding); - float32x4_t acc0 = vmulq_f32(row0, _ker[0]); - acc0 = vmlaq_f32(acc0, row1, _ker[1]); - acc0 = vmlaq_f32(acc0, row2, _ker[2]); - float sum0 = vgetq_lane_f32(acc0, 2); - if (padding == 1) { - sum0 += vgetq_lane_f32(acc0, 1); - } - output_ptr0[w] = sum0; - } - } - input_ptr0 += 
input_w_start; - input_ptr1 += input_w_start; - input_ptr2 += input_w_start; - output_ptr0 += valid_w_start; - } - // valid - float32x4_t _result0, _ext; - for (int loop = 0; loop < output_w_tiles; ++loop) { - float32x4x2_t _row0 = vld2q_f32(input_ptr0); - float32x4x2_t _row1 = vld2q_f32(input_ptr1); - float32x4x2_t _row2 = vld2q_f32(input_ptr2); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr0[8], _ext, 3); - _result0 = vmulq_lane_f32(_row0.val[0], vget_low_f32(_ker[0]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row0.val[1], vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[0]), 0); - - _ext = vextq_f32(_row1.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr1[8], _ext, 3); - _result0 = - vmlaq_lane_f32(_result0, _row1.val[0], vget_low_f32(_ker[1]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row1.val[1], vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[1]), 0); - - _ext = vextq_f32(_row2.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr2[8], _ext, 3); - _result0 = - vmlaq_lane_f32(_result0, _row2.val[0], vget_low_f32(_ker[2]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row2.val[1], vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[2]), 0); - - vst1q_f32(output_ptr0, _result0); - - input_ptr0 += 8; - input_ptr1 += 8; - input_ptr2 += 8; - output_ptr0 += 4; - } - // remain w - if (output_w_remain > 0) { - float32x4x2_t _row0 = vld2q_f32(input_ptr0); - float32x4x2_t _row1 = vld2q_f32(input_ptr1); - float32x4x2_t _row2 = vld2q_f32(input_ptr2); - - _ext = vextq_f32(_row0.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr0[8], _ext, 3); - _result0 = vmulq_lane_f32(_row0.val[0], vget_low_f32(_ker[0]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row0.val[1], vget_low_f32(_ker[0]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[0]), 0); - - _ext = vextq_f32(_row1.val[0], _ext, 1); - _ext = 
vsetq_lane_f32(input_ptr1[8], _ext, 3); - _result0 = - vmlaq_lane_f32(_result0, _row1.val[0], vget_low_f32(_ker[1]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row1.val[1], vget_low_f32(_ker[1]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[1]), 0); - - _ext = vextq_f32(_row2.val[0], _ext, 1); - _ext = vsetq_lane_f32(input_ptr2[8], _ext, 3); - _result0 = - vmlaq_lane_f32(_result0, _row2.val[0], vget_low_f32(_ker[2]), 0); - _result0 = - vmlaq_lane_f32(_result0, _row2.val[1], vget_low_f32(_ker[2]), 1); - _result0 = vmlaq_lane_f32(_result0, _ext, vget_high_f32(_ker[2]), 0); - - switch (output_w_remain) { - case 3: - vst1q_lane_f32(output_ptr0 + 2, _result0, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_result0)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _result0, 0); - break; - } - input_ptr0 += output_w_remain * 2; - input_ptr1 += output_w_remain * 2; - input_ptr2 += output_w_remain * 2; - output_ptr0 += output_w_remain; - } - // pad right - if (padding_w) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t acc0; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0; - } else { - acc0 = vmulq_f32(row0, _ker[0]); - acc0 = vmlaq_f32(acc0, row1, _ker[1]); - acc0 = vmlaq_f32(acc0, row2, _ker[2]); - float sum0 = vgetq_lane_f32(acc0, 0); - if (padding == 1) { - sum0 += vgetq_lane_f32(acc0, 1); - } - *output_ptr0 = sum0; - } - output_ptr0++; - } - } - } - // pad bottom - for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { - DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/math/depthwise_conv3x3.h 
b/mobile/src/operators/math/depthwise_conv3x3.h deleted file mode 100644 index 1f145c4f94..0000000000 --- a/mobile/src/operators/math/depthwise_conv3x3.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -// TODO(hjchen2) need to be implemented -// template -// void DepthwiseConv3x3(const framework::Tensor *input, -// const framework::Tensor *filter, -// const std::vector &strides, -// const std::vector &paddings, -// framework::Tensor *output); - -template -void DepthwiseConv3x3S1(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output); - -template -void DepthwiseConv3x3S2(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output); - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise_conv3x3_int8.cpp b/mobile/src/operators/math/depthwise_conv3x3_int8.cpp deleted file mode 100644 index e69df3e6be..0000000000 --- a/mobile/src/operators/math/depthwise_conv3x3_int8.cpp +++ /dev/null @@ -1,1660 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#include -#include "operators/math/depthwise_conv3x3.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -#define DEPTHWISE_CONV_NORMAL_BORDER(start, end) \ - for (int w = start; w < end; ++w) { \ - const int w_in_start = -padding_w + w * Stride_w; \ - const int w_in_end = w_in_start + 3; \ - const int w_start = w_in_start > 0 ? w_in_start : 0; \ - const int w_end = w_in_end < input_w ? 
w_in_end : input_w; \ - int32_t value = 0; \ - for (int h_in = h_start; h_in < h_end; ++h_in) { \ - for (int w_in = w_start; w_in < w_end; ++w_in) { \ - value += filter[(h_in - h_in_start) * 3 + (w_in - w_in_start)] * \ - input[h_in * input_w + w_in]; \ - } \ - } \ - output_ptr[w] = value; \ - } - -template -inline void Depth3x3NormalRowLoadInput(const int8_t *input, int16x8_t *y) { - y[0] = vmovl_s8(vld1_s8(input)); - y[1] = vextq_s16(y[0], y[0], 1); - y[2] = vextq_s16(y[1], y[1], 1); -} - -template <> -inline void Depth3x3NormalRowLoadInput<2>(const int8_t *input, int16x8_t *y) { - int8x8x2_t x0 = vld2_s8(input); - y[0] = vmovl_s8(x0.val[0]); - y[1] = vmovl_s8(x0.val[1]); - y[2] = vextq_s16(y[0], y[0], 1); -} - -template -inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter, - const int h_output, const int input_h, - const int input_w, const int padding_h, - const int padding_w, const int output_w, - int32_t *output, int16x4_t *ker) { - const int h_in_start = -padding_h + h_output * Stride_h; - const int h_in_end = h_in_start + 3; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int h_end = h_in_end < input_h ? 
h_in_end : input_h; - - const int valid_w_start = (padding_w + Stride_w - 1) / Stride_w; - const int valid_w_end = (input_w + padding_w - 3) / Stride_w + 1; - int32_t *output_ptr = output + h_output * output_w; - // border left - DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start) - // middle - int output_tiles = (valid_w_end - valid_w_start) / 6; - int remain_start = valid_w_start + output_tiles * 6; - int32x4_t _sum0, _sum1; - int16x8_t _y[3]; - for (int w = 0; w < output_tiles * 6; w += 6) { - _sum0 = veorq_s32(_sum0, _sum0); - _sum1 = veorq_s32(_sum1, _sum1); - int output_offset = valid_w_start + w; - int input_w_offset = output_offset * Stride_w - padding_w; - for (int h_in = h_start; h_in < h_end; ++h_in) { - int index = h_in - h_in_start; - Depth3x3NormalRowLoadInput( - input + h_in * input_w + input_w_offset, _y); - _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_y[0]), ker[index], 0); - _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_y[1]), ker[index], 1); - _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_y[2]), ker[index], 2); - _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_y[0]), ker[index], 0); - _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_y[1]), ker[index], 1); - _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_y[2]), ker[index], 2); - } - vst1q_s32(output_ptr + output_offset, _sum0); - vst1_s32(output_ptr + output_offset + 4, vget_low_s32(_sum1)); - } - for (int w = remain_start; w < valid_w_end; ++w) { - int32_t value = 0; - int input_start = -padding_w + w * Stride_w; - for (int h_in = h_start; h_in < h_end; ++h_in) { - for (int j = 0; j < 3; ++j) { - value += filter[(h_in - h_in_start) * 3 + j] * - input[h_in * input_w + j + input_start]; - } - } - output_ptr[w] = value; - } - // border right - DEPTHWISE_CONV_NORMAL_BORDER(valid_w_end, output_w) -} - -template <> -void DepthwiseConv3x3S1(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - const int8_t *input_data = input.data(); - const 
int8_t *filter_data = filter.data(); - int32_t *out_data = output->mutable_data(); - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int padding_h = paddings[0]; - int padding_w = paddings[1]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - int valid_h_start = padding_h; - int valid_h_end = output_h - valid_h_start; - int valid_h = valid_h_end - valid_h_start; - int valid_w_start = padding_w; - int valid_w_end = output_w - valid_w_start; - int valid_w = valid_w_end - valid_w_start; - - #pragma omp parallel for - for (int g = 0; g < input.dims()[1]; ++g) { - const int8_t *input_ptr = input_data + g * image_size; - const int8_t *filter_ptr = filter_data + g * 9; - int32_t *output_ptr = out_data + g * out_image_size; - - const int8_t *filter_ptr0 = filter_ptr; - const int8_t *filter_ptr1 = filter_ptr0 + 3; - const int8_t *filter_ptr2 = filter_ptr1 + 3; - int16x4_t _k0 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr0))); - int16x4_t _k1 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr1))); - int16x4_t _k2 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr2))); - int16x8_t _ker0 = vcombine_s16(_k0, _k1); - int16x8_t _ker1 = vcombine_s16(_k2, _k2); - int16x4_t zero = vdup_n_s16(0); - int16x4_t _ker[3] = {_k0, _k1, _k2}; - // top - for (int h = 0; h < valid_h_start; ++h) { - DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - // valid - int output_w_tiles = valid_w / 6; - int output_w_remain = valid_w - output_w_tiles * 6; - for (int h = valid_h_start; h < valid_h_end - 3; h += 4) { - const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const int8_t *input_ptr1 = input_ptr0 + input_w; - const int8_t *input_ptr2 = input_ptr1 + input_w; - const int8_t *input_ptr3 = input_ptr2 + input_w; - const int8_t *input_ptr4 = input_ptr3 + input_w; - const int8_t *input_ptr5 = 
input_ptr4 + input_w; - int32_t *output_ptr0 = output_ptr + h * output_w; - int32_t *output_ptr1 = output_ptr0 + output_w; - int32_t *output_ptr2 = output_ptr1 + output_w; - int32_t *output_ptr3 = output_ptr2 + output_w; - // pad left - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3))); - int16x4_t row4 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr4))); - int16x4_t row5 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr5))); - int32x4_t acc; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 3) { - output_ptr0[w] = 0; - output_ptr1[w] = 0; - output_ptr2[w] = 0; - output_ptr3[w] = 0; - } else { - row0 = vext_s16(zero, row0, 3); - row1 = vext_s16(zero, row1, 3); - row2 = vext_s16(zero, row2, 3); - row3 = vext_s16(zero, row3, 3); - row4 = vext_s16(zero, row4, 3); - row5 = vext_s16(zero, row5, 3); - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - output_ptr0[w] = vgetq_lane_s32(acc, 1) + vgetq_lane_s32(acc, 2); - acc = vmull_s16(row1, _ker[0]); - acc = vmlal_s16(acc, row2, _ker[1]); - acc = vmlal_s16(acc, row3, _ker[2]); - output_ptr1[w] = vgetq_lane_s32(acc, 1) + vgetq_lane_s32(acc, 2); - acc = vmull_s16(row2, _ker[0]); - acc = vmlal_s16(acc, row3, _ker[1]); - acc = vmlal_s16(acc, row4, _ker[2]); - output_ptr2[w] = vgetq_lane_s32(acc, 1) + vgetq_lane_s32(acc, 2); - acc = vmull_s16(row3, _ker[0]); - acc = vmlal_s16(acc, row4, _ker[1]); - acc = vmlal_s16(acc, row5, _ker[2]); - output_ptr3[w] = vgetq_lane_s32(acc, 1) + vgetq_lane_s32(acc, 2); - } - } - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - output_ptr2 += valid_w_start; - output_ptr3 += valid_w_start; - } -#if __aarch64__ -#else - // valid - int loop = output_w_tiles; - asm volatile( 
- "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - "mov r0, #6 \n" - // loop 6 width - "loop_4h6w_%=: \n" - "vld1.32 {d9}, [%[input_ptr0]], r0 \n" - "vld1.32 {d10}, [%[input_ptr1]], r0 \n" - "vld1.32 {d11}, [%[input_ptr2]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vext.s8 d12, d10, d10, #1 \n" - "vext.s8 d13, d10, d10, #2 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vmull.s16 q12, d14, %e[ker0][0] \n" - "vmlal.s16 q12, d16, %e[ker0][1] \n" - "vmlal.s16 q12, d18, %e[ker0][2] \n" - "vmull.s16 q13, d15, %e[ker0][0] \n" - "vmlal.s16 q13, d17, %e[ker0][1] \n" - "vmlal.s16 q13, d19, %e[ker0][2] \n" - - "vext.s8 d12, d11, d11, #1 \n" - "vext.s8 d13, d11, d11, #2 \n" - "vmovl.s8 q7, d11 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] \n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - // store row 0, reuse q10/q11 - "vst1.32 {d20-d22}, [%[output_ptr0]]! 
\n" - - "vmlal.s16 q12, d14, %f[ker0][0] \n" - "vmlal.s16 q12, d16, %f[ker0][1] \n" - "vmlal.s16 q12, d18, %f[ker0][2] \n" - "vmlal.s16 q13, d15, %f[ker0][0] \n" - "vmlal.s16 q13, d17, %f[ker0][1] \n" - "vmlal.s16 q13, d19, %f[ker0][2] \n" - - "vmull.s16 q14, d14, %e[ker0][0] \n" - "vmlal.s16 q14, d16, %e[ker0][1] \n" - "vmlal.s16 q14, d18, %e[ker0][2] \n" - "vmull.s16 q15, d15, %e[ker0][0] \n" - "vmlal.s16 q15, d17, %e[ker0][1] \n" - "vmlal.s16 q15, d19, %e[ker0][2] \n" - - "vld1.32 {d9}, [%[input_ptr3]], r0 \n" - "vld1.32 {d10}, [%[input_ptr4]], r0 \n" - "vld1.32 {d11}, [%[input_ptr5]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q12, d14, %e[ker1][0] \n" - "vmlal.s16 q12, d16, %e[ker1][1] \n" - "vmlal.s16 q12, d18, %e[ker1][2] \n" - "vmlal.s16 q13, d15, %e[ker1][0] \n" - "vmlal.s16 q13, d17, %e[ker1][1] \n" - "vmlal.s16 q13, d19, %e[ker1][2] \n" - // store row 1 - "vst1.32 {d24-d26}, [%[output_ptr1]]! \n" - - "vmlal.s16 q14, d14, %f[ker0][0] \n" - "vmlal.s16 q14, d16, %f[ker0][1] \n" - "vmlal.s16 q14, d18, %f[ker0][2] \n" - "vmlal.s16 q15, d15, %f[ker0][0] \n" - "vmlal.s16 q15, d17, %f[ker0][1] \n" - "vmlal.s16 q15, d19, %f[ker0][2] \n" - - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vext.s8 d12, d10, d10, #1 \n" - "vext.s8 d13, d10, d10, #2 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q14, d14, %e[ker1][0] \n" - "vmlal.s16 q14, d16, %e[ker1][1] \n" - "vmlal.s16 q14, d18, %e[ker1][2] \n" - "vmlal.s16 q15, d15, %e[ker1][0] \n" - "vmlal.s16 q15, d17, %e[ker1][1] \n" - "vmlal.s16 q15, d19, %e[ker1][2] \n" - // store row 2 - "vst1.32 {d28-d30}, [%[output_ptr2]]! 
\n" - - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vext.s8 d12, d11, d11, #1 \n" - "vext.s8 d13, d11, d11, #2 \n" - "vmovl.s8 q7, d11 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] \n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - // store row 3 - "vst1.32 {d20-d22}, [%[output_ptr3]]! \n" - - "subs %[loop], #1 \n" - "bne loop_4h6w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - - "mov r0, %[remain] \n" - "vld1.32 {d9}, [%[input_ptr0]], r0 \n" - "vmovl.s8 q7, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q8, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vld1.32 {d9}, [%[input_ptr1]], r0 \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vmovl.s8 q7, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q8, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vmull.s16 q12, d14, %e[ker0][0] \n" - "vmlal.s16 q12, d16, %e[ker0][1] \n" - "vmlal.s16 q12, d18, %e[ker0][2] \n" - "vld1.32 {d9}, [%[input_ptr2]], r0 \n" - "vmull.s16 q13, d15, %e[ker0][0] \n" - "vmlal.s16 q13, d17, %e[ker0][1] \n" - "vmlal.s16 q13, d19, %e[ker0][2] \n" - - "vmovl.s8 q7, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q8, d9 \n" - "vext.s8 
d9, d9, d9, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] \n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - - "vmlal.s16 q12, d14, %f[ker0][0] \n" - "vmlal.s16 q12, d16, %f[ker0][1] \n" - "vmlal.s16 q12, d18, %f[ker0][2] \n" - "vmlal.s16 q13, d15, %f[ker0][0] \n" - "vmlal.s16 q13, d17, %f[ker0][1] \n" - "vmlal.s16 q13, d19, %f[ker0][2] \n" - - "vmull.s16 q14, d14, %e[ker0][0] \n" - "vmlal.s16 q14, d16, %e[ker0][1] \n" - "vmlal.s16 q14, d18, %e[ker0][2] \n" - "vld1.32 {d9}, [%[input_ptr3]], r0 \n" - "vmull.s16 q15, d15, %e[ker0][0] \n" - "vmlal.s16 q15, d17, %e[ker0][1] \n" - "vmlal.s16 q15, d19, %e[ker0][2] \n" - - "vmovl.s8 q7, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q8, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmlal.s16 q12, d14, %e[ker1][0] \n" - "vmlal.s16 q12, d16, %e[ker1][1] \n" - "vmlal.s16 q12, d18, %e[ker1][2] \n" - "vmlal.s16 q13, d15, %e[ker1][0] \n" - "vmlal.s16 q13, d17, %e[ker1][1] \n" - "vmlal.s16 q13, d19, %e[ker1][2] \n" - - "vmlal.s16 q14, d14, %f[ker0][0] \n" - "vmlal.s16 q14, d16, %f[ker0][1] \n" - "vmlal.s16 q14, d18, %f[ker0][2] \n" - "vmlal.s16 q15, d15, %f[ker0][0] \n" - "vmlal.s16 q15, d17, %f[ker0][1] \n" - "vmlal.s16 q15, d19, %f[ker0][2] \n" - - "vmull.s16 q5, d14, %e[ker0][0] \n" - "vmlal.s16 q5, d16, %e[ker0][1] \n" - "vmlal.s16 q5, d18, %e[ker0][2] \n" - "vld1.32 {d9}, [%[input_ptr4]], r0 \n" - "vmull.s16 q6, d15, %e[ker0][0] \n" - "vmlal.s16 q6, d17, %e[ker0][1] \n" - "vmlal.s16 q6, d19, %e[ker0][2] \n" - - "vmovl.s8 q7, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q8, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmlal.s16 q14, d14, %e[ker1][0] \n" - "vmlal.s16 q14, d16, %e[ker1][1] \n" - "vmlal.s16 q14, d18, %e[ker1][2] \n" - "vmlal.s16 q15, d15, %e[ker1][0] \n" - "vmlal.s16 q15, d17, %e[ker1][1] \n" - "vmlal.s16 
q15, d19, %e[ker1][2] \n" - - "vmlal.s16 q5, d14, %f[ker0][0] \n" - "vmlal.s16 q5, d16, %f[ker0][1] \n" - "vmlal.s16 q5, d18, %f[ker0][2] \n" - "vld1.32 {d9}, [%[input_ptr5]], r0 \n" - "vmlal.s16 q6, d15, %f[ker0][0] \n" - "vmlal.s16 q6, d17, %f[ker0][1] \n" - "vmlal.s16 q6, d19, %f[ker0][2] \n" - - "vmovl.s8 q7, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q8, d9 \n" - "vext.s8 d9, d9, d9, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmlal.s16 q5, d14, %e[ker1][0] \n" - "vmlal.s16 q5, d16, %e[ker1][1] \n" - "vmlal.s16 q5, d18, %e[ker1][2] \n" - "vmlal.s16 q6, d15, %e[ker1][0] \n" - "vmlal.s16 q6, d17, %e[ker1][1] \n" - "vmlal.s16 q6, d19, %e[ker1][2] \n" - - "cmp %[remain], #4 \n" - "blt store_4h2w_%= \n" - "vst1.32 {q10}, [%[output_ptr0]]! \n" - "vst1.32 {q12}, [%[output_ptr1]]! \n" - "vst1.32 {q14}, [%[output_ptr2]]! \n" - "vst1.32 {q5}, [%[output_ptr3]]! \n" - "cmp %[remain], #5 \n" - "blt end_%= \n" - "vst1.32 {d22[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d26[0]}, [%[output_ptr1]]! \n" - "vst1.32 {d30[0]}, [%[output_ptr2]]! \n" - "vst1.32 {d12[0]}, [%[output_ptr3]]! \n" - "b end_%= \n" - - "store_4h2w_%=: \n" - "cmp %[remain], #2 \n" - "blt store_4h1w_%= \n" - "vst1.32 {d20}, [%[output_ptr0]]! \n" - "vst1.32 {d24}, [%[output_ptr1]]! \n" - "vst1.32 {d28}, [%[output_ptr2]]! \n" - "vst1.32 {d10}, [%[output_ptr3]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d21[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d25[0]}, [%[output_ptr1]]! \n" - "vst1.32 {d29[0]}, [%[output_ptr2]]! \n" - "vst1.32 {d11[0]}, [%[output_ptr3]]! \n" - "b end_%= \n" - - "store_4h1w_%=: \n" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.32 {d20[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d24[0]}, [%[output_ptr1]]! \n" - "vst1.32 {d28[0]}, [%[output_ptr2]]! \n" - "vst1.32 {d10[0]}, [%[output_ptr3]]! 
\n" - "end_%=: \n" - : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), - [output_ptr2] "+r"(output_ptr2), [output_ptr3] "+r"(output_ptr3), - [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), - [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), - [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5), - [loop] "+r"(loop) - : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15", "r0"); -#endif // __aarch64__ - // pad right - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1 - 2))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2 - 2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3 - 2))); - int16x4_t row4 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr4 - 2))); - int16x4_t row5 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr5 - 2))); - row0 = vext_s16(row0, zero, 2); - row1 = vext_s16(row1, zero, 2); - row2 = vext_s16(row2, zero, 2); - row3 = vext_s16(row3, zero, 2); - row4 = vext_s16(row4, zero, 2); - row5 = vext_s16(row5, zero, 2); - int32x4_t acc; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0; - *output_ptr1 = 0; - *output_ptr2 = 0; - *output_ptr3 = 0; - } else { - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - *output_ptr0 = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1); - acc = vmull_s16(row1, _ker[0]); - acc = vmlal_s16(acc, row2, _ker[1]); - acc = vmlal_s16(acc, row3, _ker[2]); - *output_ptr1 = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1); - acc = vmull_s16(row2, _ker[0]); - acc = vmlal_s16(acc, row3, _ker[1]); - acc = vmlal_s16(acc, row4, _ker[2]); - *output_ptr2 = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1); - acc = 
vmull_s16(row3, _ker[0]); - acc = vmlal_s16(acc, row4, _ker[1]); - acc = vmlal_s16(acc, row5, _ker[2]); - *output_ptr3 = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1); - - row0 = vext_s16(row0, zero, 1); - row1 = vext_s16(row1, zero, 1); - row2 = vext_s16(row2, zero, 1); - row3 = vext_s16(row3, zero, 1); - row4 = vext_s16(row4, zero, 1); - row5 = vext_s16(row5, zero, 1); - } - output_ptr0++; - output_ptr1++; - output_ptr2++; - output_ptr3++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xFFFFFFFC); - for (int h = start_h; h < valid_h_end - 1; h += 2) { - const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const int8_t *input_ptr1 = input_ptr0 + input_w; - const int8_t *input_ptr2 = input_ptr1 + input_w; - const int8_t *input_ptr3 = input_ptr2 + input_w; - int32_t *output_ptr0 = output_ptr + h * output_w; - int32_t *output_ptr1 = output_ptr0 + output_w; - // pad left - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3))); - int32x4_t acc; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 3) { - output_ptr0[w] = 0; - output_ptr1[w] = 0; - } else { - row0 = vext_s16(zero, row0, 3); - row1 = vext_s16(zero, row1, 3); - row2 = vext_s16(zero, row2, 3); - row3 = vext_s16(zero, row3, 3); - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - output_ptr0[w] = vgetq_lane_s32(acc, 1) + vgetq_lane_s32(acc, 2); - acc = vmull_s16(row1, _ker[0]); - acc = vmlal_s16(acc, row2, _ker[1]); - acc = vmlal_s16(acc, row3, _ker[2]); - output_ptr1[w] = vgetq_lane_s32(acc, 1) + vgetq_lane_s32(acc, 2); - } - } - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - } - // valid -#if __aarch64__ -#else - int loop = 
output_w_tiles; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - "mov r0, #6 \n" - // loop 6 widths - "loop_2h6w_%=: \n" - "vld1.32 {d9}, [%[input_ptr0]], r0 \n" - "vld1.32 {d10}, [%[input_ptr1]], r0 \n" - "vld1.32 {d11}, [%[input_ptr2]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vext.s8 d12, d10, d10, #1 \n" - "vext.s8 d13, d10, d10, #2 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vmull.s16 q12, d14, %e[ker0][0] \n" - "vmlal.s16 q12, d16, %e[ker0][1] \n" - "vmlal.s16 q12, d18, %e[ker0][2] \n" - "vmull.s16 q13, d15, %e[ker0][0] \n" - "vmlal.s16 q13, d17, %e[ker0][1] \n" - "vmlal.s16 q13, d19, %e[ker0][2] \n" - - "vext.s8 d12, d11, d11, #1 \n" - "vext.s8 d13, d11, d11, #2 \n" - "vmovl.s8 q7, d11 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] \n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - // store row 0, reuse q10/q11 - "vst1.32 {d20-d22}, [%[output_ptr0]]! 
\n" - - "vmlal.s16 q12, d14, %f[ker0][0] \n" - "vmlal.s16 q12, d16, %f[ker0][1] \n" - "vmlal.s16 q12, d18, %f[ker0][2] \n" - "vmlal.s16 q13, d15, %f[ker0][0] \n" - "vmlal.s16 q13, d17, %f[ker0][1] \n" - "vmlal.s16 q13, d19, %f[ker0][2] \n" - - "vld1.32 {d9}, [%[input_ptr3]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q12, d14, %e[ker1][0] \n" - "vmlal.s16 q12, d16, %e[ker1][1] \n" - "vmlal.s16 q12, d18, %e[ker1][2] \n" - "vmlal.s16 q13, d15, %e[ker1][0] \n" - "vmlal.s16 q13, d17, %e[ker1][1] \n" - "vmlal.s16 q13, d19, %e[ker1][2] \n" - // store row 1 - "vst1.32 {d24-d26}, [%[output_ptr1]]! \n" - - "subs %[loop], #1 \n" - "bne loop_2h6w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - - "mov r0, %[remain] \n" - "vld1.32 {d9}, [%[input_ptr0]], r0 \n" - "vld1.32 {d10}, [%[input_ptr1]], r0 \n" - "vld1.32 {d11}, [%[input_ptr2]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vext.s8 d12, d10, d10, #1 \n" - "vext.s8 d13, d10, d10, #2 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vmull.s16 q12, d14, %e[ker0][0] \n" - "vmlal.s16 q12, d16, %e[ker0][1] \n" - "vmlal.s16 q12, d18, %e[ker0][2] \n" - "vmull.s16 q13, d15, %e[ker0][0] \n" - "vmlal.s16 q13, d17, %e[ker0][1] \n" - "vmlal.s16 q13, d19, %e[ker0][2] \n" - - "vext.s8 d12, d11, d11, #1 \n" - 
"vext.s8 d13, d11, d11, #2 \n" - "vmovl.s8 q7, d11 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] \n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - - "vmlal.s16 q12, d14, %f[ker0][0] \n" - "vmlal.s16 q12, d16, %f[ker0][1] \n" - "vmlal.s16 q12, d18, %f[ker0][2] \n" - "vmlal.s16 q13, d15, %f[ker0][0] \n" - "vmlal.s16 q13, d17, %f[ker0][1] \n" - "vmlal.s16 q13, d19, %f[ker0][2] \n" - - "vld1.32 {d9}, [%[input_ptr3]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q12, d14, %e[ker1][0] \n" - "vmlal.s16 q12, d16, %e[ker1][1] \n" - "vmlal.s16 q12, d18, %e[ker1][2] \n" - "vmlal.s16 q13, d15, %e[ker1][0] \n" - "vmlal.s16 q13, d17, %e[ker1][1] \n" - "vmlal.s16 q13, d19, %e[ker1][2] \n" - - "cmp %[remain], #4 \n" - "blt store_2h2w_%= \n" - "vst1.32 {q10}, [%[output_ptr0]]! \n" - "vst1.32 {q12}, [%[output_ptr1]]! \n" - "cmp %[remain], #5 \n" - "blt end_%= \n" - "vst1.32 {d22[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d26[0]}, [%[output_ptr1]]! \n" - "b end_%= \n" - - "store_2h2w_%=: \n" - "cmp %[remain], #2 \n" - "blt store_2h1w_%= \n" - "vst1.32 {d20}, [%[output_ptr0]]! \n" - "vst1.32 {d24}, [%[output_ptr1]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d21[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d25[0]}, [%[output_ptr1]]! \n" - "b end_%= \n" - - "store_2h1w_%=: \n" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.32 {d20[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d24[0]}, [%[output_ptr1]]! 
\n" - "end_%=: \n" - : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), - [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), - [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), - [loop] "+r"(loop) - : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15", "r0"); -#endif // __aarch64__ - // pad right - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1 - 2))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2 - 2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3 - 2))); - row0 = vext_s16(row0, zero, 2); - row1 = vext_s16(row1, zero, 2); - row2 = vext_s16(row2, zero, 2); - row3 = vext_s16(row3, zero, 2); - int32x4_t acc; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0; - *output_ptr1 = 0; - } else { - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - *output_ptr0 = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1); - acc = vmull_s16(row1, _ker[0]); - acc = vmlal_s16(acc, row2, _ker[1]); - acc = vmlal_s16(acc, row3, _ker[2]); - *output_ptr1 = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1); - - row0 = vext_s16(row0, zero, 1); - row1 = vext_s16(row1, zero, 1); - row2 = vext_s16(row2, zero, 1); - row3 = vext_s16(row3, zero, 1); - } - output_ptr0++; - output_ptr1++; - } - } - } - - start_h = valid_h_start + (valid_h & 0xFFFFFFFE); - if (start_h < valid_h_end) { - const int8_t *input_ptr0 = input_ptr + (start_h - padding_h) * input_w; - const int8_t *input_ptr1 = input_ptr0 + input_w; - const int8_t *input_ptr2 = input_ptr1 + input_w; - int32_t *output_ptr0 = output_ptr + start_h * output_w; - // pad left - if (padding_w) { - int16x4_t row0 = 
vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int32x4_t acc; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 3) { - output_ptr0[w] = 0; - } else { - row0 = vext_s16(zero, row0, 3); - row1 = vext_s16(zero, row1, 3); - row2 = vext_s16(zero, row2, 3); - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - output_ptr0[w] = vgetq_lane_s32(acc, 1) + vgetq_lane_s32(acc, 2); - } - } - output_ptr0 += valid_w_start; - } - // valid -#if __aarch64__ -#else - int loop = output_w_tiles; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - "mov r0, #6 \n" - // loop 6 widths - "loop_1h6w_%=: \n" - "vld1.32 {d9}, [%[input_ptr0]], r0 \n" - "vld1.32 {d10}, [%[input_ptr1]], r0 \n" - "vld1.32 {d11}, [%[input_ptr2]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vext.s8 d12, d10, d10, #1 \n" - "vext.s8 d13, d10, d10, #2 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vext.s8 d12, d11, d11, #1 \n" - "vext.s8 d13, d11, d11, #2 \n" - "vmovl.s8 q7, d11 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] 
\n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - // store row 0, reuse q10/q11 - "vst1.32 {d20-d22}, [%[output_ptr0]]! \n" - - "subs %[loop], #1 \n" - "bne loop_1h6w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - "mov r0, %[remain] \n" - - "vld1.32 {d9}, [%[input_ptr0]], r0 \n" - "vld1.32 {d10}, [%[input_ptr1]], r0 \n" - "vld1.32 {d11}, [%[input_ptr2]], r0 \n" - "vext.s8 d12, d9, d9, #1 \n" - "vext.s8 d13, d9, d9, #2 \n" - "vmovl.s8 q7, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vext.s8 d12, d10, d10, #1 \n" - "vext.s8 d13, d10, d10, #2 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vext.s8 d12, d11, d11, #1 \n" - "vext.s8 d13, d11, d11, #2 \n" - "vmovl.s8 q7, d11 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] \n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - - "cmp %[remain], #4 \n" - "blt store_1h2w_%= \n" - "vst1.32 {q10}, [%[output_ptr0]]! \n" - "cmp %[remain], #5 \n" - "blt end_%= \n" - "vst1.32 {d22[0]}, [%[output_ptr0]]! \n" - "b end_%= \n" - - "store_1h2w_%=: \n" - "cmp %[remain], #2 \n" - "blt store_1h1w_%= \n" - "vst1.32 {d20}, [%[output_ptr0]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d21[0]}, [%[output_ptr0]]! 
\n" - "b end_%= \n" - - "store_1h1w_%=: \n" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.32 {d20[0]}, [%[output_ptr0]]! \n" - "end_%=: \n" - : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0), - [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2), - [loop] "+r"(loop) - : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15", "r0"); -#endif // __aarch64__ - // pad right - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1 - 2))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2 - 2))); - row0 = vext_s16(row0, zero, 2); - row1 = vext_s16(row1, zero, 2); - row2 = vext_s16(row2, zero, 2); - int32x4_t acc; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0; - } else { - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - *output_ptr0 = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1); - - row0 = vext_s16(row0, zero, 1); - row1 = vext_s16(row1, zero, 1); - row2 = vext_s16(row2, zero, 1); - } - output_ptr0++; - } - } - } - // bottom - for (int h = valid_h_end; h < output_h; ++h) { - DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - } -} - -template <> -void DepthwiseConv3x3S2(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - const int8_t *input_data = input.data(); - const int8_t *filter_data = filter.data(); - int32_t *out_data = output->mutable_data(); - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int padding_h = paddings[0]; - 
int padding_w = paddings[1]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - int valid_h_start = (padding_h + 1) / 2; - int valid_h_end = (input_h + padding_h - 1) / 2; - int valid_h = valid_h_end - valid_h_start; - int valid_w_start = (padding_w + 1) / 2; - int valid_w_end = (input_w + padding_w - 1) / 2; - int valid_w = valid_w_end - valid_w_start; - // for pad left - int valid_input_w_start = (valid_w_start << 1) - padding_w; - - // DLOG << "valid_h_start: " << valid_h_start; - // DLOG << "valid_h_end: " << valid_h_end; - // DLOG << "valid_w_start: " << valid_w_start; - // DLOG << "valid_w_end: " << valid_w_end; - - #pragma omp parallel for - for (int g = 0; g < input.dims()[1]; ++g) { - const int8_t *input_ptr = input_data + g * image_size; - const int8_t *filter_ptr = filter_data + g * 9; - int32_t *output_ptr = out_data + g * out_image_size; - - const int8_t *filter_ptr0 = filter_ptr; - const int8_t *filter_ptr1 = filter_ptr0 + 3; - const int8_t *filter_ptr2 = filter_ptr1 + 3; - int16x4_t _k0 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr0))); - int16x4_t _k1 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr1))); - int16x4_t _k2 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr2))); - int16x8_t _ker0 = vcombine_s16(_k0, _k1); - int16x8_t _ker1 = vcombine_s16(_k2, _k2); - int16x4_t _ker[3] = {_k0, _k1, _k2}; - - // top - for (int h = 0; h < valid_h_start; ++h) { - DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - // valid - int input_w_start = 2 * valid_w_start - padding_w; - int output_w_tiles = valid_w / 6; - int output_w_remain = valid_w - output_w_tiles * 6; - for (int h = valid_h_start; h < valid_h_end - 2; h += 3) { - const int8_t *input_ptr0 = input_ptr + (2 * h - padding_h) * input_w; - const int8_t *input_ptr1 = input_ptr0 + input_w; - const int8_t *input_ptr2 = input_ptr1 + input_w; - const int8_t *input_ptr3 = input_ptr2 + input_w; - const int8_t 
*input_ptr4 = input_ptr3 + input_w; - const int8_t *input_ptr5 = input_ptr4 + input_w; - const int8_t *input_ptr6 = input_ptr5 + input_w; - int32_t *output_ptr0 = output_ptr + h * output_w; - int32_t *output_ptr1 = output_ptr0 + output_w; - int32_t *output_ptr2 = output_ptr1 + output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - (w << 1); - if (padding >= 3) { - output_ptr0[w] = 0; - output_ptr1[w] = 0; - output_ptr2[w] = 0; - } else { - int16x4_t row0 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - padding))); - int16x4_t row1 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr1 - padding))); - int16x4_t row2 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr2 - padding))); - int16x4_t row3 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr3 - padding))); - int16x4_t row4 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr4 - padding))); - int16x4_t row5 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr5 - padding))); - int16x4_t row6 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr6 - padding))); - int32x4_t acc0 = vmull_s16(row0, _ker[0]); - acc0 = vmlal_s16(acc0, row1, _ker[1]); - acc0 = vmlal_s16(acc0, row2, _ker[2]); - int32x4_t acc1 = vmull_s16(row2, _ker[0]); - acc1 = vmlal_s16(acc1, row3, _ker[1]); - acc1 = vmlal_s16(acc1, row4, _ker[2]); - int32x4_t acc2 = vmull_s16(row4, _ker[0]); - acc2 = vmlal_s16(acc2, row5, _ker[1]); - acc2 = vmlal_s16(acc2, row6, _ker[2]); - int32_t sum0 = vgetq_lane_s32(acc0, 2); - int32_t sum1 = vgetq_lane_s32(acc1, 2); - int32_t sum2 = vgetq_lane_s32(acc2, 2); - if (padding == 1) { - sum0 += vgetq_lane_s32(acc0, 1); - sum1 += vgetq_lane_s32(acc1, 1); - sum2 += vgetq_lane_s32(acc2, 1); - } - output_ptr0[w] = sum0; - output_ptr1[w] = sum1; - output_ptr2[w] = sum2; - } - } - input_ptr0 += valid_input_w_start; - input_ptr1 += valid_input_w_start; - input_ptr2 += valid_input_w_start; - input_ptr3 += valid_input_w_start; - input_ptr4 += valid_input_w_start; - input_ptr5 += valid_input_w_start; - input_ptr6 += 
valid_input_w_start; - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - output_ptr2 += valid_w_start; - } - // valid -#if __aarch64__ -#else - int loop = output_w_tiles; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - "mov r0, #12 \n" - // loop 6 widths - "loop_3h6w_%=: \n" - "vld2.8 {d10-d11}, [%[input_ptr0]], r0 \n" - "vld2.8 {d12-d13}, [%[input_ptr1]], r0 \n" - "vld2.8 {d14-d15}, [%[input_ptr2]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmull.s16 q11, d16, %e[ker0][0] \n" - "vmlal.s16 q11, d18, %e[ker0][1] \n" - "vmlal.s16 q11, d20, %e[ker0][2] \n" - "vmull.s16 q12, d17, %e[ker0][0] \n" - "vmlal.s16 q12, d19, %e[ker0][1] \n" - "vmlal.s16 q12, d21, %e[ker0][2] \n" - - "vext.s8 d9, d12, d12, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q11, d16, %f[ker0][0] \n" - "vmlal.s16 q11, d18, %f[ker0][1] \n" - "vmlal.s16 q11, d20, %f[ker0][2] \n" - "vmlal.s16 q12, d17, %f[ker0][0] \n" - "vmlal.s16 q12, d19, %f[ker0][1] \n" - "vmlal.s16 q12, d21, %f[ker0][2] \n" - - "vext.s8 d9, d14, d14, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d14 \n" - "vmovl.s8 q9, d15 \n" - "vmlal.s16 q11, d16, %e[ker1][0] \n" - "vmlal.s16 q11, d18, %e[ker1][1] \n" - "vmlal.s16 q11, d20, %e[ker1][2] \n" - "vmlal.s16 q12, d17, %e[ker1][0] \n" - "vmlal.s16 q12, d19, %e[ker1][1] \n" - "vmlal.s16 q12, d21, %e[ker1][2] \n" - // store row 0, reuse q11/q12 - "vst1.32 {d22-d24}, [%[output_ptr0]]! 
\n" - - "vmull.s16 q13, d16, %e[ker0][0] \n" - "vmlal.s16 q13, d18, %e[ker0][1] \n" - "vmlal.s16 q13, d20, %e[ker0][2] \n" - "vmull.s16 q14, d17, %e[ker0][0] \n" - "vmlal.s16 q14, d19, %e[ker0][1] \n" - "vmlal.s16 q14, d21, %e[ker0][2] \n" - - "vld2.8 {d10-d11}, [%[input_ptr3]], r0 \n" - "vld2.8 {d12-d13}, [%[input_ptr4]], r0 \n" - "vld2.8 {d14-d15}, [%[input_ptr5]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmlal.s16 q13, d16, %f[ker0][0] \n" - "vmlal.s16 q13, d18, %f[ker0][1] \n" - "vmlal.s16 q13, d20, %f[ker0][2] \n" - "vmlal.s16 q14, d17, %f[ker0][0] \n" - "vmlal.s16 q14, d19, %f[ker0][1] \n" - "vmlal.s16 q14, d21, %f[ker0][2] \n" - - "vext.s8 d9, d12, d12, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q13, d16, %e[ker1][0] \n" - "vmlal.s16 q13, d18, %e[ker1][1] \n" - "vmlal.s16 q13, d20, %e[ker1][2] \n" - "vmlal.s16 q14, d17, %e[ker1][0] \n" - "vmlal.s16 q14, d19, %e[ker1][1] \n" - "vmlal.s16 q14, d21, %e[ker1][2] \n" - // store row 1 - "vst1.32 {d26-d28}, [%[output_ptr1]]! 
\n" - - "vmull.s16 q11, d16, %e[ker0][0] \n" - "vmlal.s16 q11, d18, %e[ker0][1] \n" - "vmlal.s16 q11, d20, %e[ker0][2] \n" - "vmull.s16 q12, d17, %e[ker0][0] \n" - "vmlal.s16 q12, d19, %e[ker0][1] \n" - "vmlal.s16 q12, d21, %e[ker0][2] \n" - - "vext.s8 d9, d14, d14, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d14 \n" - "vmovl.s8 q9, d15 \n" - "vmlal.s16 q11, d16, %f[ker0][0] \n" - "vmlal.s16 q11, d18, %f[ker0][1] \n" - "vmlal.s16 q11, d20, %f[ker0][2] \n" - "vmlal.s16 q12, d17, %f[ker0][0] \n" - "vmlal.s16 q12, d19, %f[ker0][1] \n" - "vmlal.s16 q12, d21, %f[ker0][2] \n" - - "vld2.8 {d10-d11}, [%[input_ptr6]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmlal.s16 q11, d16, %e[ker1][0] \n" - "vmlal.s16 q11, d18, %e[ker1][1] \n" - "vmlal.s16 q11, d20, %e[ker1][2] \n" - "vmlal.s16 q12, d17, %e[ker1][0] \n" - "vmlal.s16 q12, d19, %e[ker1][1] \n" - "vmlal.s16 q12, d21, %e[ker1][2] \n" - // store row 2 - "vst1.32 {d22-d24}, [%[output_ptr2]]! 
\n" - - "subs %[loop], #1 \n" - "bne loop_3h6w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - "mov r0, %[remain], lsl #1 \n" - - "vld2.8 {d10-d11}, [%[input_ptr0]], r0 \n" - "vld2.8 {d12-d13}, [%[input_ptr1]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d11 \n" - "vmull.s16 q10, d14, %e[ker0][0] \n" - "vmlal.s16 q10, d16, %e[ker0][1] \n" - "vmlal.s16 q10, d18, %e[ker0][2] \n" - "vmull.s16 q11, d15, %e[ker0][0] \n" - "vmlal.s16 q11, d17, %e[ker0][1] \n" - "vmlal.s16 q11, d19, %e[ker0][2] \n" - - "vext.s8 d9, d12, d12, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmovl.s8 q7, d12 \n" - "vmovl.s8 q8, d13 \n" - "vmlal.s16 q10, d14, %f[ker0][0] \n" - "vmlal.s16 q10, d16, %f[ker0][1] \n" - "vmlal.s16 q10, d18, %f[ker0][2] \n" - "vmlal.s16 q11, d15, %f[ker0][0] \n" - "vmlal.s16 q11, d17, %f[ker0][1] \n" - "vmlal.s16 q11, d19, %f[ker0][2] \n" - - "vld2.8 {d10-d11}, [%[input_ptr2]], r0 \n" - "vld2.8 {d12-d13}, [%[input_ptr3]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d11 \n" - "vmlal.s16 q10, d14, %e[ker1][0] \n" - "vmlal.s16 q10, d16, %e[ker1][1] \n" - "vmlal.s16 q10, d18, %e[ker1][2] \n" - "vmlal.s16 q11, d15, %e[ker1][0] \n" - "vmlal.s16 q11, d17, %e[ker1][1] \n" - "vmlal.s16 q11, d19, %e[ker1][2] \n" - - "vmull.s16 q12, d14, %e[ker0][0] \n" - "vmlal.s16 q12, d16, %e[ker0][1] \n" - "vmlal.s16 q12, d18, %e[ker0][2] \n" - "vmull.s16 q13, d15, %e[ker0][0] \n" - "vmlal.s16 q13, d17, %e[ker0][1] \n" - "vmlal.s16 q13, d19, %e[ker0][2] \n" - - "vext.s8 d9, d12, d12, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmovl.s8 q7, d12 \n" - "vmovl.s8 q8, d13 \n" - "vmlal.s16 q12, d14, %f[ker0][0] \n" - "vmlal.s16 q12, d16, %f[ker0][1] \n" - "vmlal.s16 q12, d18, %f[ker0][2] \n" - "vmlal.s16 q13, d15, %f[ker0][0] \n" - "vmlal.s16 q13, d17, %f[ker0][1] \n" - "vmlal.s16 q13, d19, %f[ker0][2] \n" - - "vld2.8 {d10-d11}, [%[input_ptr4]], r0 \n" - "vld2.8 {d12-d13}, 
[%[input_ptr5]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d11 \n" - "vmlal.s16 q12, d14, %e[ker1][0] \n" - "vmlal.s16 q12, d16, %e[ker1][1] \n" - "vmlal.s16 q12, d18, %e[ker1][2] \n" - "vmlal.s16 q13, d15, %e[ker1][0] \n" - "vmlal.s16 q13, d17, %e[ker1][1] \n" - "vmlal.s16 q13, d19, %e[ker1][2] \n" - - "vmull.s16 q14, d14, %e[ker0][0] \n" - "vmlal.s16 q14, d16, %e[ker0][1] \n" - "vmlal.s16 q14, d18, %e[ker0][2] \n" - "vmull.s16 q15, d15, %e[ker0][0] \n" - "vmlal.s16 q15, d17, %e[ker0][1] \n" - "vmlal.s16 q15, d19, %e[ker0][2] \n" - - "vext.s8 d9, d12, d12, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmovl.s8 q7, d12 \n" - "vmovl.s8 q8, d13 \n" - "vmlal.s16 q14, d14, %f[ker0][0] \n" - "vmlal.s16 q14, d16, %f[ker0][1] \n" - "vmlal.s16 q14, d18, %f[ker0][2] \n" - "vmlal.s16 q15, d15, %f[ker0][0] \n" - "vmlal.s16 q15, d17, %f[ker0][1] \n" - "vmlal.s16 q15, d19, %f[ker0][2] \n" - - "vld2.8 {d10-d11}, [%[input_ptr6]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q9, d9 \n" - "vmovl.s8 q7, d10 \n" - "vmovl.s8 q8, d11 \n" - "vmlal.s16 q14, d14, %e[ker1][0] \n" - "vmlal.s16 q14, d16, %e[ker1][1] \n" - "vmlal.s16 q14, d18, %e[ker1][2] \n" - "vmlal.s16 q15, d15, %e[ker1][0] \n" - "vmlal.s16 q15, d17, %e[ker1][1] \n" - "vmlal.s16 q15, d19, %e[ker1][2] \n" - - "cmp %[remain], #4 \n" - "blt store_3h2w_%= \n" - "vst1.32 {q10}, [%[output_ptr0]]! \n" - "vst1.32 {q12}, [%[output_ptr1]]! \n" - "vst1.32 {q14}, [%[output_ptr2]]! \n" - "cmp %[remain], #5 \n" - "blt end_%= \n" - "vst1.32 {d22[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d26[0]}, [%[output_ptr1]]! \n" - "vst1.32 {d30[0]}, [%[output_ptr2]]! \n" - "b end_%= \n" - - "store_3h2w_%=: \n" - "cmp %[remain], #2 \n" - "blt store_3h1w_%= \n" - "vst1.32 {d20}, [%[output_ptr0]]! \n" - "vst1.32 {d24}, [%[output_ptr1]]! \n" - "vst1.32 {d28}, [%[output_ptr2]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d21[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d25[0]}, [%[output_ptr1]]! 
\n" - "vst1.32 {d29[0]}, [%[output_ptr2]]! \n" - "b end_%= \n" - - "store_3h1w_%=: \n" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.32 {d20[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d24[0]}, [%[output_ptr1]]! \n" - "vst1.32 {d28[0]}, [%[output_ptr2]]! \n" - "end_%=: \n" - : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), - [output_ptr2] "+r"(output_ptr2), [input_ptr6] "+r"(input_ptr6), - [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), - [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), - [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5), - [loop] "+r"(loop) - : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15", "r0"); -#endif // __aarch64__ - // pad right - if (padding_w > 0) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3))); - int16x4_t row4 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr4))); - int16x4_t row5 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr5))); - int16x4_t row6 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr6))); - int32x4_t acc0, acc1, acc2; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0; - *output_ptr1 = 0; - *output_ptr2 = 0; - } else { - acc0 = vmull_s16(row0, _ker[0]); - acc0 = vmlal_s16(acc0, row1, _ker[1]); - acc0 = vmlal_s16(acc0, row2, _ker[2]); - acc1 = vmull_s16(row2, _ker[0]); - acc1 = vmlal_s16(acc1, row3, _ker[1]); - acc1 = vmlal_s16(acc1, row4, _ker[2]); - acc2 = vmull_s16(row4, _ker[0]); - acc2 = vmlal_s16(acc2, row5, _ker[1]); - acc2 = vmlal_s16(acc2, row6, _ker[2]); - int32_t sum0 = vgetq_lane_s32(acc0, 0); - int32_t sum1 = vgetq_lane_s32(acc1, 0); - int32_t sum2 = 
vgetq_lane_s32(acc2, 0); - if (padding == 1) { - sum0 += vgetq_lane_s32(acc0, 1); - sum1 += vgetq_lane_s32(acc1, 1); - sum2 += vgetq_lane_s32(acc2, 1); - } - *output_ptr0 = sum0; - *output_ptr1 = sum1; - *output_ptr2 = sum2; - } - output_ptr0++; - output_ptr1++; - output_ptr2++; - } - } - } - // remain height - int start_h = valid_h_start + valid_h / 3 * 3; - for (int h = start_h; h < valid_h_end; ++h) { - const int8_t *input_ptr0 = input_ptr + (2 * h - padding_h) * input_w; - const int8_t *input_ptr1 = input_ptr0 + input_w; - const int8_t *input_ptr2 = input_ptr1 + input_w; - int32_t *output_ptr0 = output_ptr + h * output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - (w << 1); - if (padding >= 3) { - output_ptr0[w] = 0; - } else { - int16x4_t row0 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - padding))); - int16x4_t row1 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr1 - padding))); - int16x4_t row2 = - vget_low_s16(vmovl_s8(vld1_s8(input_ptr2 - padding))); - int32x4_t acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - int32_t sum0 = vgetq_lane_s32(acc, 2); - if (padding == 1) { - sum0 += vgetq_lane_s32(acc, 1); - } - output_ptr0[w] = sum0; - } - } - input_ptr0 += valid_input_w_start; - input_ptr1 += valid_input_w_start; - input_ptr2 += valid_input_w_start; - output_ptr0 += valid_w_start; - } - // valid -#if __aarch64__ -#else - int loop = output_w_tiles; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - "mov r0, #12 \n" - // loop 6 widths - "loop_1h6w_%=: \n" - "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n" - "vld2.8 {d12, d13}, [%[input_ptr1]], r0 \n" - "vld2.8 {d14, d15}, [%[input_ptr2]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmull.s16 q11, d16, %e[ker0][0] \n" - "vmlal.s16 q11, d18, %e[ker0][1] \n" - "vmlal.s16 q11, d20, %e[ker0][2] \n" - 
"vmull.s16 q12, d17, %e[ker0][0] \n" - "vmlal.s16 q12, d19, %e[ker0][1] \n" - "vmlal.s16 q12, d21, %e[ker0][2] \n" - - "vext.s8 d9, d12, d12, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q11, d16, %f[ker0][0] \n" - "vmlal.s16 q11, d18, %f[ker0][1] \n" - "vmlal.s16 q11, d20, %f[ker0][2] \n" - "vmlal.s16 q12, d17, %f[ker0][0] \n" - "vmlal.s16 q12, d19, %f[ker0][1] \n" - "vmlal.s16 q12, d21, %f[ker0][2] \n" - - "vext.s8 d9, d14, d14, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d14 \n" - "vmovl.s8 q9, d15 \n" - "vmlal.s16 q11, d16, %e[ker1][0] \n" - "vmlal.s16 q11, d18, %e[ker1][1] \n" - "vmlal.s16 q11, d20, %e[ker1][2] \n" - "vmlal.s16 q12, d17, %e[ker1][0] \n" - "vmlal.s16 q12, d19, %e[ker1][1] \n" - "vmlal.s16 q12, d21, %e[ker1][2] \n" - // store row 0 - "vst1.32 {d22-d24}, [%[output_ptr0]]! \n" - - "subs %[loop], #1 \n" - "bne loop_1h6w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - "mov r0, %[remain], lsl #1 \n" - - "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n" - "vld2.8 {d12, d13}, [%[input_ptr1]], r0 \n" - "vld2.8 {d14, d15}, [%[input_ptr2]], r0 \n" - "vext.s8 d9, d10, d10, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmull.s16 q11, d16, %e[ker0][0] \n" - "vmlal.s16 q11, d18, %e[ker0][1] \n" - "vmlal.s16 q11, d20, %e[ker0][2] \n" - "vmull.s16 q12, d17, %e[ker0][0] \n" - "vmlal.s16 q12, d19, %e[ker0][1] \n" - "vmlal.s16 q12, d21, %e[ker0][2] \n" - - "vext.s8 d9, d12, d12, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q11, d16, %f[ker0][0] \n" - "vmlal.s16 q11, d18, %f[ker0][1] \n" - "vmlal.s16 q11, d20, %f[ker0][2] \n" - "vmlal.s16 q12, d17, %f[ker0][0] \n" - "vmlal.s16 q12, d19, %f[ker0][1] \n" - "vmlal.s16 q12, d21, %f[ker0][2] \n" - - "vext.s8 d9, d14, d14, #1 \n" - "vmovl.s8 q10, d9 \n" - "vmovl.s8 q8, d14 \n" - "vmovl.s8 q9, d15 \n" - "vmlal.s16 q11, d16, %e[ker1][0] \n" - "vmlal.s16 q11, d18, 
%e[ker1][1] \n" - "vmlal.s16 q11, d20, %e[ker1][2] \n" - "vmlal.s16 q12, d17, %e[ker1][0] \n" - "vmlal.s16 q12, d19, %e[ker1][1] \n" - "vmlal.s16 q12, d21, %e[ker1][2] \n" - - "cmp %[remain], #4 \n" - "blt store_1h2w_%= \n" - "vst1.32 {q11}, [%[output_ptr0]]! \n" - "cmp %[remain], #5 \n" - "blt end_%= \n" - "vst1.32 {d24[0]}, [%[output_ptr0]]! \n" - "b end_%= \n" - - "store_1h2w_%=: \n" - "cmp %[remain], #2 \n" - "blt store_1h1w_%= \n" - "vst1.32 {d22}, [%[output_ptr0]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d23[0]}, [%[output_ptr0]]! \n" - "b end_%= \n" - - "store_1h1w_%=: \n" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.32 {d22[0]}, [%[output_ptr0]]! \n" - "end_%=: \n" - : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0), - [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2), - [loop] "+r"(loop) - : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15", "r0"); -#endif // __aarch64__ - // pad right - if (padding_w > 0) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int32x4_t acc; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0; - } else { - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - int32_t sum0 = vgetq_lane_s32(acc, 0); - if (padding == 1) { - sum0 += vgetq_lane_s32(acc, 1); - } - *output_ptr0 = sum0; - } - output_ptr0++; - } - } - } - // bottom - for (int h = valid_h_end; h < output_h; ++h) { - DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker); - } - } -} - -} // namespace math -} // namespace operators -} // namespace 
paddle_mobile - -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/math/depthwise_conv5x5.cpp b/mobile/src/operators/math/depthwise_conv5x5.cpp deleted file mode 100644 index a721cce71e..0000000000 --- a/mobile/src/operators/math/depthwise_conv5x5.cpp +++ /dev/null @@ -1,1106 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#include "operators/math/depthwise_conv5x5.h" -#include -#include - -namespace paddle_mobile { -namespace operators { -namespace math { - -#ifndef __aarch64__ -inline float32x4_t vpaddq_f32(float32x4_t r0, float32x4_t r1) { - float32x2_t sum0 = vpadd_f32(vget_low_f32(r0), vget_high_f32(r0)); - float32x2_t sum1 = vpadd_f32(vget_low_f32(r1), vget_high_f32(r1)); - return vcombine_f32(sum0, sum1); -} -#endif - -template -inline void Depth5x5NormalRowLoadInput(const float *input, float32x4_t *y) { - y[0] = vld1q_f32(input); - y[4] = vld1q_f32(input + 4); - y[1] = vextq_f32(y[0], y[4], 1); - y[2] = vextq_f32(y[0], y[4], 2); - y[3] = vextq_f32(y[0], y[4], 3); -} - -template <> -inline void Depth5x5NormalRowLoadInput<2>(const float *input, float32x4_t *y) { - float32x4x2_t x = vld2q_f32(input); - y[0] = x.val[0]; - y[1] = x.val[1]; - y[2] = vextq_f32(y[0], y[0], 1); - y[3] = vextq_f32(y[1], y[1], 1); - y[4] = vextq_f32(y[0], y[0], 2); -} - -#define DEPTHWISE_CONV5X5_NORMAL_BORDER(start, end) \ - for (int w = start; w < end; ++w) { \ - const 
int w_in_start = -padding_w + w * Stride_w; \ - const int w_in_end = w_in_start + 5; \ - const int w_start = w_in_start > 0 ? w_in_start : 0; \ - const int w_end = w_in_end < input_w ? w_in_end : input_w; \ - float value = 0; \ - for (int h_in = h_start; h_in < h_end; ++h_in) { \ - for (int w_in = w_start; w_in < w_end; ++w_in) { \ - value += filter[(h_in - h_in_start) * 5 + (w_in - w_in_start)] * \ - input[h_in * input_w + w_in]; \ - } \ - } \ - output_ptr[w] = value; \ - } - -template -inline void DepthwiseConv5x5NormalRow(const float *input, const float *filter, - const int h_output, const int input_h, - const int input_w, const int padding_h, - const int padding_w, const int output_w, - float *output, float32x4_t *ker, - float32_t *ker1) { - const int h_in_start = -padding_h + h_output * Stride_h; - const int h_in_end = h_in_start + 5; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int h_end = h_in_end < input_h ? h_in_end : input_h; - - int valid_w_start = (padding_w + Stride_w - 1) / Stride_w; - int valid_w_end = (input_w + padding_w - 5) / Stride_w + 1; - if (valid_w_end < valid_w_start) { - valid_w_end = valid_w_start; - } - float *output_ptr = output + h_output * output_w; - - // border left - DEPTHWISE_CONV5X5_NORMAL_BORDER(0, valid_w_start) - // middle - int output_tiles = (valid_w_end - valid_w_start) >> 2; - float32x4_t _sum, _x[5]; - // valid w - for (int w = 0; w < output_tiles * 4; w += 4) { - _sum = vdupq_n_f32(0.f); - int output_offset = valid_w_start + w; - int input_w_offset = output_offset * Stride_w - padding_w; - for (int h_in = h_start; h_in < h_end; ++h_in) { - int index = h_in - h_in_start; - Depth5x5NormalRowLoadInput( - input + h_in * input_w + input_w_offset, _x); - _sum = vmlaq_n_f32(_sum, _x[0], ker1[index]); - _sum = vmlaq_lane_f32(_sum, _x[1], vget_low_f32(ker[index]), 0); - _sum = vmlaq_lane_f32(_sum, _x[2], vget_low_f32(ker[index]), 1); - _sum = vmlaq_lane_f32(_sum, _x[3], vget_high_f32(ker[index]), 0); - _sum = 
vmlaq_lane_f32(_sum, _x[4], vget_high_f32(ker[index]), 1); - } - vst1q_f32(output_ptr + output_offset, _sum); - } - // remain valid w - int remain = (valid_w_end - valid_w_start) & 0x3; - if (remain > 0) { - _sum = vdupq_n_f32(0.f); - int remain_start = valid_w_start + (output_tiles << 2); - int input_w_offset = remain_start * Stride_w - padding_w; - float *output_ptr0 = output_ptr + remain_start; - - for (int h_in = h_start; h_in < h_end; ++h_in) { - int index = h_in - h_in_start; - Depth5x5NormalRowLoadInput( - input + h_in * input_w + input_w_offset, _x); - _sum = vmlaq_n_f32(_sum, _x[0], ker1[index]); - _sum = vmlaq_lane_f32(_sum, _x[1], vget_low_f32(ker[index]), 0); - _sum = vmlaq_lane_f32(_sum, _x[2], vget_low_f32(ker[index]), 1); - _sum = vmlaq_lane_f32(_sum, _x[3], vget_high_f32(ker[index]), 0); - _sum = vmlaq_lane_f32(_sum, _x[4], vget_high_f32(ker[index]), 1); - } - switch (remain) { - case 3: - vst1q_lane_f32(output_ptr0 + 2, _sum, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_sum)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _sum, 0); - break; - } - } - // border right - DEPTHWISE_CONV5X5_NORMAL_BORDER(valid_w_end, output_w) -} - -template <> -void DepthwiseConv5x5S1(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - const float *input_data = input.data(); - const float *filter_data = filter.data(); - float *out_data = output->mutable_data(); - - const int input_h = input.dims()[2]; - const int input_w = input.dims()[3]; - const int output_h = output->dims()[2]; - const int output_w = output->dims()[3]; - const int padding_h = paddings[0]; - const int padding_w = paddings[1]; - const int image_size = input_h * input_w; - const int out_image_size = output_h * output_w; - const int valid_h_start = padding_h; - const int valid_h_end = output_h - valid_h_start; - const int valid_h = valid_h_end - valid_h_start; - const int valid_w_start = padding_w; - const int 
valid_w_end = output_w - valid_w_start; - const int valid_w = valid_w_end - valid_w_start; - - #pragma omp parallel for - for (int g = 0; g < output->dims()[1]; ++g) { - const float *input_ptr = input_data + g * image_size; - const float *filter_ptr = filter_data + g * 25; - float *output_ptr = out_data + g * out_image_size; - - const float *filter_ptr0 = filter_ptr; - const float *filter_ptr1 = filter_ptr0 + 5; - const float *filter_ptr2 = filter_ptr1 + 5; - const float *filter_ptr3 = filter_ptr2 + 5; - const float *filter_ptr4 = filter_ptr3 + 5; - float32x4_t _ker[7]; - float32_t _ker1[5] = {*filter_ptr0, *filter_ptr1, *filter_ptr2, - *filter_ptr3, *filter_ptr4}; - _ker[0] = vld1q_f32(filter_ptr0 + 1); - _ker[1] = vld1q_f32(filter_ptr1 + 1); - _ker[2] = vld1q_f32(filter_ptr2 + 1); - _ker[3] = vld1q_f32(filter_ptr3 + 1); - _ker[4] = vld1q_f32(filter_ptr4 + 1); - _ker[5] = vld1q_f32(_ker1); - _ker[6] = vld1q_f32(_ker1 + 4); - - // pad top - for (int h = 0; h < valid_h_start; ++h) { - DepthwiseConv5x5NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker, _ker1); - } - - // output 4x4 - int output_w_tiles = valid_w / 4; - int output_w_remain = valid_w - output_w_tiles * 4; - for (int h = valid_h_start; h < valid_h_end - 1; h += 2) { - const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - const float *input_ptr4 = input_ptr3 + input_w; - const float *input_ptr5 = input_ptr4 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - float *output_ptr1 = output_ptr0 + output_w; - // pad left - if (padding_w) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t row3 = vld1q_f32(input_ptr3); - float32x4_t row4 = vld1q_f32(input_ptr4); - float32x4_t row5 = 
vld1q_f32(input_ptr5); - float32x4_t zero = vdupq_n_f32(0.f); - float32x4_t acc0, acc1; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 5) { - output_ptr0[w] = 0.f; - output_ptr1[w] = 0.f; - } else { - acc0 = vmulq_f32(row0, _ker[0]); - acc0 = vmlaq_f32(acc0, row1, _ker[1]); - acc0 = vmlaq_f32(acc0, row2, _ker[2]); - acc0 = vmlaq_f32(acc0, row3, _ker[3]); - acc0 = vmlaq_f32(acc0, row4, _ker[4]); - acc1 = vmulq_f32(row1, _ker[0]); - acc1 = vmlaq_f32(acc1, row2, _ker[1]); - acc1 = vmlaq_f32(acc1, row3, _ker[2]); - acc1 = vmlaq_f32(acc1, row4, _ker[3]); - acc1 = vmlaq_f32(acc1, row5, _ker[4]); - acc0 = vpaddq_f32(acc0, acc1); - float32x2_t sum = - vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); - vst1_lane_f32(output_ptr0 + w, sum, 0); - vst1_lane_f32(output_ptr1 + w, sum, 1); - - row0 = vextq_f32(zero, row0, 3); - row1 = vextq_f32(zero, row1, 3); - row2 = vextq_f32(zero, row2, 3); - row3 = vextq_f32(zero, row3, 3); - row4 = vextq_f32(zero, row4, 3); - row5 = vextq_f32(zero, row5, 3); - } - } - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - } - // valid -#if __aarch64__ - float32x4_t _q14, _q15; - for (int loop = 0; loop < output_w_tiles; ++loop) { - float32x4_t _q7 = vld1q_f32(input_ptr0); - float32x4_t _q8 = vld1q_f32(input_ptr0 + 4); - float32x4_t _q9 = vld1q_f32(input_ptr1); - float32x4_t _q10 = vld1q_f32(input_ptr1 + 4); - float32x4_t _q11 = vld1q_f32(input_ptr2); - float32x4_t _q12 = vld1q_f32(input_ptr2 + 4); - - _q14 = vmulq_lane_f32(_q7, vget_low_f32(_ker[5]), 0); - float32x4_t _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[0]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[0]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[5]), 1); - _q15 
= vmulq_lane_f32(_q9, vget_low_f32(_ker[5]), 0); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[0]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[0]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[1]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[0]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[1]), 1); - _q15 = vmlaq_lane_f32(_q15, _q10, vget_high_f32(_ker[0]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q11, vget_high_f32(_ker[5]), 0); - _q15 = vmlaq_lane_f32(_q15, _q11, vget_low_f32(_ker[5]), 1); - _q13 = vextq_f32(_q11, _q12, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[1]), 0); - _q13 = vextq_f32(_q11, _q12, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[1]), 1); - _q13 = vextq_f32(_q11, _q12, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[2]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[1]), 0); - _q14 = vmlaq_lane_f32(_q14, _q12, vget_high_f32(_ker[2]), 1); - _q15 = vmlaq_lane_f32(_q15, _q12, vget_high_f32(_ker[1]), 1); - - _q7 = vld1q_f32(input_ptr3); - _q8 = vld1q_f32(input_ptr3 + 4); - _q9 = vld1q_f32(input_ptr4); - _q10 = vld1q_f32(input_ptr4 + 4); - _q11 = vld1q_f32(input_ptr5); - _q12 = vld1q_f32(input_ptr5 + 4); - - _q14 = vmlaq_lane_f32(_q14, _q7, vget_high_f32(_ker[5]), 1); - _q15 = vmlaq_lane_f32(_q15, _q7, vget_high_f32(_ker[5]), 0); - _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[2]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 1); - _q15 = 
vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[2]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[3]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[2]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[3]), 1); - _q15 = vmlaq_lane_f32(_q15, _q8, vget_high_f32(_ker[2]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[6]), 0); - _q15 = vmlaq_lane_f32(_q15, _q9, vget_high_f32(_ker[5]), 1); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[3]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[3]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[4]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[3]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[4]), 1); - _q15 = vmlaq_lane_f32(_q15, _q10, vget_high_f32(_ker[3]), 1); - - _q15 = vmlaq_lane_f32(_q15, _q11, vget_low_f32(_ker[6]), 0); - _q13 = vextq_f32(_q11, _q12, 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[4]), 0); - _q13 = vextq_f32(_q11, _q12, 2); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[4]), 1); - _q13 = vextq_f32(_q11, _q12, 3); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[4]), 0); - _q15 = vmlaq_lane_f32(_q15, _q12, vget_high_f32(_ker[4]), 1); - - vst1q_f32(output_ptr0, _q14); - vst1q_f32(output_ptr1, _q15); - - input_ptr0 += 4; - input_ptr1 += 4; - input_ptr2 += 4; - input_ptr3 += 4; - input_ptr4 += 4; - input_ptr5 += 4; - output_ptr0 += 4; - output_ptr1 += 4; - } - // remain w - if (output_w_remain > 0) { - float32x4_t _q7 = vld1q_f32(input_ptr0); - float32x4_t _q8 = vld1q_f32(input_ptr0 + 4); - float32x4_t _q9 = vld1q_f32(input_ptr1); - float32x4_t _q10 = vld1q_f32(input_ptr1 + 4); - float32x4_t _q11 = vld1q_f32(input_ptr2); - 
float32x4_t _q12 = vld1q_f32(input_ptr2 + 4); - - _q14 = vmulq_lane_f32(_q7, vget_low_f32(_ker[5]), 0); - float32x4_t _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[0]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[0]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[5]), 1); - _q15 = vmulq_lane_f32(_q9, vget_low_f32(_ker[5]), 0); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[0]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[0]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[1]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[0]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[1]), 1); - _q15 = vmlaq_lane_f32(_q15, _q10, vget_high_f32(_ker[0]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q11, vget_high_f32(_ker[5]), 0); - _q15 = vmlaq_lane_f32(_q15, _q11, vget_low_f32(_ker[5]), 1); - _q13 = vextq_f32(_q11, _q12, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[1]), 0); - _q13 = vextq_f32(_q11, _q12, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[1]), 1); - _q13 = vextq_f32(_q11, _q12, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[2]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[1]), 0); - _q14 = vmlaq_lane_f32(_q14, _q12, vget_high_f32(_ker[2]), 1); - _q15 = vmlaq_lane_f32(_q15, _q12, vget_high_f32(_ker[1]), 1); - - _q7 = vld1q_f32(input_ptr3); - _q8 = 
vld1q_f32(input_ptr3 + 4); - _q9 = vld1q_f32(input_ptr4); - _q10 = vld1q_f32(input_ptr4 + 4); - _q11 = vld1q_f32(input_ptr5); - _q12 = vld1q_f32(input_ptr5 + 4); - - _q14 = vmlaq_lane_f32(_q14, _q7, vget_high_f32(_ker[5]), 1); - _q15 = vmlaq_lane_f32(_q15, _q7, vget_high_f32(_ker[5]), 0); - _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[2]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[2]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[3]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[2]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[3]), 1); - _q15 = vmlaq_lane_f32(_q15, _q8, vget_high_f32(_ker[2]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[6]), 0); - _q15 = vmlaq_lane_f32(_q15, _q9, vget_high_f32(_ker[5]), 1); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[3]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[3]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[4]), 0); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[3]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[4]), 1); - _q15 = vmlaq_lane_f32(_q15, _q10, vget_high_f32(_ker[3]), 1); - - _q15 = vmlaq_lane_f32(_q15, _q11, vget_low_f32(_ker[6]), 0); - _q13 = vextq_f32(_q11, _q12, 1); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[4]), 0); - _q13 = vextq_f32(_q11, _q12, 2); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_low_f32(_ker[4]), 1); - _q13 = vextq_f32(_q11, _q12, 3); - _q15 = vmlaq_lane_f32(_q15, _q13, vget_high_f32(_ker[4]), 0); - _q15 
= vmlaq_lane_f32(_q15, _q12, vget_high_f32(_ker[4]), 1); - - switch (output_w_remain) { - case 3: - vst1q_lane_f32(output_ptr0 + 2, _q14, 2); - vst1q_lane_f32(output_ptr1 + 2, _q15, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_q14)); - vst1_f32(output_ptr1, vget_low_f32(_q15)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _q14, 0); - vst1q_lane_f32(output_ptr1, _q15, 0); - break; - } - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - input_ptr2 += output_w_remain; - input_ptr3 += output_w_remain; - input_ptr4 += output_w_remain; - input_ptr5 += output_w_remain; - output_ptr0 += output_w_remain; - output_ptr1 += output_w_remain; - } -#else - int loop = output_w_tiles; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - "mov r0, #16 \n" - "loop_2h4w_%=: \n" - "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" - "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" - "vmul.f32 q14, q7, %e[ker0][0] \n" - "vext.32 q13, q7, q8, #1 \n" - "vmla.f32 q14, q13, %e[kr0][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr0][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr0][0] \n" - "vmla.f32 q14, q8, %f[kr0][1] \n" - - "vmla.f32 q14, q9, %e[ker0][1] \n" - "vmul.f32 q15, q9, %e[ker0][0] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr1][0] \n" - "vmla.f32 q15, q13, %e[kr0][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr1][1] \n" - "vmla.f32 q15, q13, %e[kr0][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr1][0] \n" - "vmla.f32 q15, q13, %f[kr0][0] \n" - "vmla.f32 q14, q10, %f[kr1][1] \n" - "vmla.f32 q15, q10, %f[kr0][1] \n" - - "vmla.f32 q14, q11, %f[ker0][0] \n" - "vmla.f32 q15, q11, %e[ker0][1] \n" - "vext.32 q13, q11, q12, #1 \n" - "vmla.f32 q14, q13, %e[kr2][0] \n" - "vmla.f32 q15, q13, %e[kr1][0] \n" - "vext.32 q13, q11, q12, #2 \n" - "vmla.f32 q14, q13, %e[kr2][1] \n" - "vmla.f32 q15, q13, %e[kr1][1] \n" - "vext.32 
q13, q11, q12, #3 \n" - "vmla.f32 q14, q13, %f[kr2][0] \n" - "vmla.f32 q15, q13, %f[kr1][0] \n" - "vmla.f32 q14, q12, %f[kr2][1] \n" - "vmla.f32 q15, q12, %f[kr1][1] \n" - - "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" - "vld1.32 {d22-d25}, [%[input_ptr5]], r0 \n" - "vmla.f32 q14, q7, %f[ker0][1] \n" - "vmla.f32 q15, q7, %f[ker0][0] \n" - "vext.32 q13, q7, q8, #1 \n" - "vmla.f32 q14, q13, %e[kr3][0] \n" - "vmla.f32 q15, q13, %e[kr2][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr3][1] \n" - "vmla.f32 q15, q13, %e[kr2][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr3][0] \n" - "vmla.f32 q15, q13, %f[kr2][0] \n" - "vmla.f32 q14, q8, %f[kr3][1] \n" - "vmla.f32 q15, q8, %f[kr2][1] \n" - - "vmla.f32 q14, q9, %e[ker1][0] \n" - "vmla.f32 q15, q9, %f[ker0][1] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr4][0] \n" - "vmla.f32 q15, q13, %e[kr3][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr4][1] \n" - "vmla.f32 q15, q13, %e[kr3][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr4][0] \n" - "vmla.f32 q15, q13, %f[kr3][0] \n" - "vmla.f32 q14, q10, %f[kr4][1] \n" - "vmla.f32 q15, q10, %f[kr3][1] \n" - - "vmla.f32 q15, q11, %e[ker1][0] \n" - "vext.32 q13, q11, q12, #1 \n" - "vmla.f32 q15, q13, %e[kr4][0] \n" - "vext.32 q13, q11, q12, #2 \n" - "vmla.f32 q15, q13, %e[kr4][1] \n" - "vext.32 q13, q11, q12, #3 \n" - "vmla.f32 q15, q13, %f[kr4][0] \n" - "vmla.f32 q15, q12, %f[kr4][1] \n" - // restore output - "vst1.32 {q14}, [%[output_ptr0]]! \n" - "vst1.32 {q15}, [%[output_ptr1]]! 
\n" - "subs %[loop], #1 \n" - "bne loop_2h4w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - "mov r0, %[remain], lsl #2 \n" - "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" - "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" - "vmul.f32 q14, q7, %e[ker0][0] \n" - "vext.32 q13, q7, q8, #1 \n" - "vmla.f32 q14, q13, %e[kr0][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr0][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr0][0] \n" - "vmla.f32 q14, q8, %f[kr0][1] \n" - - "vmla.f32 q14, q9, %e[ker0][1] \n" - "vmul.f32 q15, q9, %e[ker0][0] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr1][0] \n" - "vmla.f32 q15, q13, %e[kr0][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr1][1] \n" - "vmla.f32 q15, q13, %e[kr0][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr1][0] \n" - "vmla.f32 q15, q13, %f[kr0][0] \n" - "vmla.f32 q14, q10, %f[kr1][1] \n" - "vmla.f32 q15, q10, %f[kr0][1] \n" - - "vmla.f32 q14, q11, %f[ker0][0] \n" - "vmla.f32 q15, q11, %e[ker0][1] \n" - "vext.32 q13, q11, q12, #1 \n" - "vmla.f32 q14, q13, %e[kr2][0] \n" - "vmla.f32 q15, q13, %e[kr1][0] \n" - "vext.32 q13, q11, q12, #2 \n" - "vmla.f32 q14, q13, %e[kr2][1] \n" - "vmla.f32 q15, q13, %e[kr1][1] \n" - "vext.32 q13, q11, q12, #3 \n" - "vmla.f32 q14, q13, %f[kr2][0] \n" - "vmla.f32 q15, q13, %f[kr1][0] \n" - "vmla.f32 q14, q12, %f[kr2][1] \n" - "vmla.f32 q15, q12, %f[kr1][1] \n" - - "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" - "vld1.32 {d22-d25}, [%[input_ptr5]], r0 \n" - "vmla.f32 q14, q7, %f[ker0][1] \n" - "vmla.f32 q15, q7, %f[ker0][0] \n" - "vext.32 q13, q7, q8, #1 \n" - "vmla.f32 q14, q13, %e[kr3][0] \n" - "vmla.f32 q15, q13, %e[kr2][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr3][1] \n" - "vmla.f32 q15, q13, %e[kr2][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr3][0] 
\n" - "vmla.f32 q15, q13, %f[kr2][0] \n" - "vmla.f32 q14, q8, %f[kr3][1] \n" - "vmla.f32 q15, q8, %f[kr2][1] \n" - - "vmla.f32 q14, q9, %e[ker1][0] \n" - "vmla.f32 q15, q9, %f[ker0][1] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr4][0] \n" - "vmla.f32 q15, q13, %e[kr3][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr4][1] \n" - "vmla.f32 q15, q13, %e[kr3][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr4][0] \n" - "vmla.f32 q15, q13, %f[kr3][0] \n" - "vmla.f32 q14, q10, %f[kr4][1] \n" - "vmla.f32 q15, q10, %f[kr3][1] \n" - - "vmla.f32 q15, q11, %e[ker1][0] \n" - "vext.32 q13, q11, q12, #1 \n" - "vmla.f32 q15, q13, %e[kr4][0] \n" - "vext.32 q13, q11, q12, #2 \n" - "vmla.f32 q15, q13, %e[kr4][1] \n" - "vext.32 q13, q11, q12, #3 \n" - "vmla.f32 q15, q13, %f[kr4][0] \n" - "vmla.f32 q15, q12, %f[kr4][1] \n" - - "cmp %[remain], #2 \n" - "blt store_2h1w_%= \n" - "vst1.32 {d28}, [%[output_ptr0]]! \n" - "vst1.32 {d30}, [%[output_ptr1]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d29[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d31[0]}, [%[output_ptr1]]! \n" - "b end_%= \n" - - "store_2h1w_%=: \n" - "vst1.32 {d28[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d30[0]}, [%[output_ptr1]]! 
\n" - "end_%=: \n" - : [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), - [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), - [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5), - [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), - [loop] "+r"(loop) - : [remain] "r"(output_w_remain), [kr0] "w"(_ker[0]), - [kr1] "w"(_ker[1]), [kr2] "w"(_ker[2]), [kr3] "w"(_ker[3]), - [kr4] "w"(_ker[4]), [ker0] "w"(_ker[5]), [ker1] "w"(_ker[6]) - : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", - "q15", "r0"); -#endif // __aarch64__ - // pad right - if (padding_w) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t row3 = vld1q_f32(input_ptr3); - float32x4_t row4 = vld1q_f32(input_ptr4); - float32x4_t row5 = vld1q_f32(input_ptr5); - float32x4_t zero = vdupq_n_f32(0.f); - float32x4_t acc0, acc1; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 5 - (padding_w + input_w); - if (padding >= 5) { - *output_ptr0 = 0.f; - *output_ptr1 = 0.f; - } else { - int iw = w - valid_w_end; - float sum0 = input_ptr0[iw] * filter_ptr0[0] + - input_ptr1[iw] * filter_ptr1[0] + - input_ptr2[iw] * filter_ptr2[0] + - input_ptr3[iw] * filter_ptr3[0] + - input_ptr4[iw] * filter_ptr4[0]; - float sum1 = input_ptr1[iw] * filter_ptr0[0] + - input_ptr2[iw] * filter_ptr1[0] + - input_ptr3[iw] * filter_ptr2[0] + - input_ptr4[iw] * filter_ptr3[0] + - input_ptr5[iw] * filter_ptr4[0]; - row0 = vextq_f32(row0, zero, 1); - row1 = vextq_f32(row1, zero, 1); - row2 = vextq_f32(row2, zero, 1); - row3 = vextq_f32(row3, zero, 1); - row4 = vextq_f32(row4, zero, 1); - row5 = vextq_f32(row5, zero, 1); - acc0 = vmulq_f32(row0, _ker[0]); - acc0 = vmlaq_f32(acc0, row1, _ker[1]); - acc0 = vmlaq_f32(acc0, row2, _ker[2]); - acc0 = vmlaq_f32(acc0, row3, _ker[3]); - acc0 = vmlaq_f32(acc0, row4, _ker[4]); - acc1 = vmulq_f32(row1, _ker[0]); - acc1 = 
vmlaq_f32(acc1, row2, _ker[1]); - acc1 = vmlaq_f32(acc1, row3, _ker[2]); - acc1 = vmlaq_f32(acc1, row4, _ker[3]); - acc1 = vmlaq_f32(acc1, row5, _ker[4]); - acc0 = vpaddq_f32(acc0, acc1); - float32x2_t sum = - vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); - sum0 += vget_lane_f32(sum, 0); - sum1 += vget_lane_f32(sum, 1); - *output_ptr0 = sum0; - *output_ptr1 = sum1; - } - output_ptr0++; - output_ptr1++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xfffffffe); - if (start_h < valid_h_end) { - const float *input_ptr0 = input_ptr + (start_h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - const float *input_ptr4 = input_ptr3 + input_w; - float *output_ptr0 = output_ptr + start_h * output_w; - // pad left - if (padding_w) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t row3 = vld1q_f32(input_ptr3); - float32x4_t row4 = vld1q_f32(input_ptr4); - float32x4_t zero = vdupq_n_f32(0.f); - float32x4_t acc; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 5) { - output_ptr0[w] = 0.f; - } else { - acc = vmulq_f32(row0, _ker[0]); - acc = vmlaq_f32(acc, row1, _ker[1]); - acc = vmlaq_f32(acc, row2, _ker[2]); - acc = vmlaq_f32(acc, row3, _ker[3]); - acc = vmlaq_f32(acc, row4, _ker[4]); - float32x2_t sum = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc)); - sum = vpadd_f32(sum, sum); - vst1_lane_f32(output_ptr0 + w, sum, 0); - - row0 = vextq_f32(zero, row0, 3); - row1 = vextq_f32(zero, row1, 3); - row2 = vextq_f32(zero, row2, 3); - row3 = vextq_f32(zero, row3, 3); - row4 = vextq_f32(zero, row4, 3); - } - } - output_ptr0 += valid_w_start; - } - // valid -#if __aarch64__ - float32x4_t _q14; - for (int loop = 0; loop < output_w_tiles; ++loop) { - float32x4_t _q7 = 
vld1q_f32(input_ptr0); - float32x4_t _q8 = vld1q_f32(input_ptr0 + 4); - float32x4_t _q9 = vld1q_f32(input_ptr1); - float32x4_t _q10 = vld1q_f32(input_ptr1 + 4); - float32x4_t _q11 = vld1q_f32(input_ptr2); - float32x4_t _q12 = vld1q_f32(input_ptr2 + 4); - - _q14 = vmulq_lane_f32(_q7, vget_low_f32(_ker[5]), 0); - float32x4_t _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[0]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[0]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[5]), 1); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[1]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[1]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q11, vget_high_f32(_ker[5]), 0); - _q13 = vextq_f32(_q11, _q12, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 0); - _q13 = vextq_f32(_q11, _q12, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 1); - _q13 = vextq_f32(_q11, _q12, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[2]), 0); - _q14 = vmlaq_lane_f32(_q14, _q12, vget_high_f32(_ker[2]), 1); - - _q7 = vld1q_f32(input_ptr3); - _q8 = vld1q_f32(input_ptr3 + 4); - _q9 = vld1q_f32(input_ptr4); - _q10 = vld1q_f32(input_ptr4 + 4); - - _q14 = vmlaq_lane_f32(_q14, _q7, vget_high_f32(_ker[5]), 1); - _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, 
vget_high_f32(_ker[3]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[3]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[6]), 0); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[4]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[4]), 1); - - vst1q_f32(output_ptr0, _q14); - - input_ptr0 += 4; - input_ptr1 += 4; - input_ptr2 += 4; - input_ptr3 += 4; - input_ptr4 += 4; - output_ptr0 += 4; - } - // remain w - if (output_w_remain > 0) { - float32x4_t _q7 = vld1q_f32(input_ptr0); - float32x4_t _q8 = vld1q_f32(input_ptr0 + 4); - float32x4_t _q9 = vld1q_f32(input_ptr1); - float32x4_t _q10 = vld1q_f32(input_ptr1 + 4); - float32x4_t _q11 = vld1q_f32(input_ptr2); - float32x4_t _q12 = vld1q_f32(input_ptr2 + 4); - - _q14 = vmulq_lane_f32(_q7, vget_low_f32(_ker[5]), 0); - float32x4_t _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[0]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[0]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[0]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[5]), 1); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[1]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[1]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[1]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q11, vget_high_f32(_ker[5]), 0); - _q13 = vextq_f32(_q11, _q12, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 0); - _q13 = 
vextq_f32(_q11, _q12, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[2]), 1); - _q13 = vextq_f32(_q11, _q12, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[2]), 0); - _q14 = vmlaq_lane_f32(_q14, _q12, vget_high_f32(_ker[2]), 1); - - _q7 = vld1q_f32(input_ptr3); - _q8 = vld1q_f32(input_ptr3 + 4); - _q9 = vld1q_f32(input_ptr4); - _q10 = vld1q_f32(input_ptr4 + 4); - - _q14 = vmlaq_lane_f32(_q14, _q7, vget_high_f32(_ker[5]), 1); - _q13 = vextq_f32(_q7, _q8, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 0); - _q13 = vextq_f32(_q7, _q8, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[3]), 1); - _q13 = vextq_f32(_q7, _q8, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[3]), 0); - _q14 = vmlaq_lane_f32(_q14, _q8, vget_high_f32(_ker[3]), 1); - - _q14 = vmlaq_lane_f32(_q14, _q9, vget_low_f32(_ker[6]), 0); - _q13 = vextq_f32(_q9, _q10, 1); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 0); - _q13 = vextq_f32(_q9, _q10, 2); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_low_f32(_ker[4]), 1); - _q13 = vextq_f32(_q9, _q10, 3); - _q14 = vmlaq_lane_f32(_q14, _q13, vget_high_f32(_ker[4]), 0); - _q14 = vmlaq_lane_f32(_q14, _q10, vget_high_f32(_ker[4]), 1); - - switch (output_w_remain) { - case 3: - vst1q_lane_f32(output_ptr0 + 2, _q14, 2); - case 2: - vst1_f32(output_ptr0, vget_low_f32(_q14)); - break; - case 1: - vst1q_lane_f32(output_ptr0, _q14, 0); - break; - } - - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - input_ptr2 += output_w_remain; - input_ptr3 += output_w_remain; - input_ptr4 += output_w_remain; - output_ptr0 += output_w_remain; - } -#else - int loop = output_w_tiles; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - "mov r0, #16 \n" - "loop_1h4w_%=: \n" - "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" - "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" - "vmul.f32 q14, q7, %e[ker0][0] \n" - "vext.32 q13, q7, q8, #1 \n" - 
"vmla.f32 q14, q13, %e[kr0][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr0][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr0][0] \n" - "vmla.f32 q14, q8, %f[kr0][1] \n" - - "vmla.f32 q14, q9, %e[ker0][1] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr1][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr1][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr1][0] \n" - "vmla.f32 q14, q10, %f[kr1][1] \n" - - "vmla.f32 q14, q11, %f[ker0][0] \n" - "vext.32 q13, q11, q12, #1 \n" - "vmla.f32 q14, q13, %e[kr2][0] \n" - "vext.32 q13, q11, q12, #2 \n" - "vmla.f32 q14, q13, %e[kr2][1] \n" - "vext.32 q13, q11, q12, #3 \n" - "vmla.f32 q14, q13, %f[kr2][0] \n" - "vmla.f32 q14, q12, %f[kr2][1] \n" - - "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" - "vmla.f32 q14, q7, %f[ker0][1] \n" - "vext.32 q13, q7, q8, #1 \n" - "vmla.f32 q14, q13, %e[kr3][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr3][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr3][0] \n" - "vmla.f32 q14, q8, %f[kr3][1] \n" - - "vmla.f32 q14, q9, %e[ker1][0] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr4][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr4][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr4][0] \n" - "vmla.f32 q14, q10, %f[kr4][1] \n" - - // restore output - "vst1.32 {q14}, [%[output_ptr0]]! 
\n" - "subs %[loop], #1 \n" - "bne loop_1h4w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - "mov r0, %[remain], lsl #2 \n" - "vld1.32 {d14-d17}, [%[input_ptr0]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr1]], r0 \n" - "vld1.32 {d22-d25}, [%[input_ptr2]], r0 \n" - "vmul.f32 q14, q7, %e[ker0][0] \n" - "vext.32 q13, q7, q8, #1 \n" - "vmla.f32 q14, q13, %e[kr0][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr0][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr0][0] \n" - "vmla.f32 q14, q8, %f[kr0][1] \n" - - "vmla.f32 q14, q9, %e[ker0][1] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr1][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr1][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr1][0] \n" - "vmla.f32 q14, q10, %f[kr1][1] \n" - - "vmla.f32 q14, q11, %f[ker0][0] \n" - "vext.32 q13, q11, q12, #1 \n" - "vmla.f32 q14, q13, %e[kr2][0] \n" - "vext.32 q13, q11, q12, #2 \n" - "vmla.f32 q14, q13, %e[kr2][1] \n" - "vext.32 q13, q11, q12, #3 \n" - "vmla.f32 q14, q13, %f[kr2][0] \n" - "vmla.f32 q14, q12, %f[kr2][1] \n" - - "vld1.32 {d14-d17}, [%[input_ptr3]], r0 \n" - "vld1.32 {d18-d21}, [%[input_ptr4]], r0 \n" - "vmla.f32 q14, q7, %f[ker0][1] \n" - "vext.32 q13, q7, q8, #1 \n" - "vmla.f32 q14, q13, %e[kr3][0] \n" - "vext.32 q13, q7, q8, #2 \n" - "vmla.f32 q14, q13, %e[kr3][1] \n" - "vext.32 q13, q7, q8, #3 \n" - "vmla.f32 q14, q13, %f[kr3][0] \n" - "vmla.f32 q14, q8, %f[kr3][1] \n" - - "vmla.f32 q14, q9, %e[ker1][0] \n" - "vext.32 q13, q9, q10, #1 \n" - "vmla.f32 q14, q13, %e[kr4][0] \n" - "vext.32 q13, q9, q10, #2 \n" - "vmla.f32 q14, q13, %e[kr4][1] \n" - "vext.32 q13, q9, q10, #3 \n" - "vmla.f32 q14, q13, %f[kr4][0] \n" - "vmla.f32 q14, q10, %f[kr4][1] \n" - - "cmp %[remain], #2 \n" - "blt store_1h1w_%= \n" - "vst1.32 {d28}, [%[output_ptr0]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d29[0]}, [%[output_ptr0]]! 
\n" - "b end_%= \n" - - "store_1h1w_%=: \n" - "vst1.32 {d28[0]}, [%[output_ptr0]]! \n" - "end_%=: \n" - : [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), - [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), - [input_ptr4] "+r"(input_ptr4), [output_ptr0] "+r"(output_ptr0), - [loop] "+r"(loop) - : [remain] "r"(output_w_remain), [kr0] "w"(_ker[0]), - [kr1] "w"(_ker[1]), [kr2] "w"(_ker[2]), [kr3] "w"(_ker[3]), - [kr4] "w"(_ker[4]), [ker0] "w"(_ker[5]), [ker1] "w"(_ker[6]) - : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", - "q15", "r0"); -#endif // __aarch64__ - // pad right - if (padding_w) { - float32x4_t row0 = vld1q_f32(input_ptr0); - float32x4_t row1 = vld1q_f32(input_ptr1); - float32x4_t row2 = vld1q_f32(input_ptr2); - float32x4_t row3 = vld1q_f32(input_ptr3); - float32x4_t row4 = vld1q_f32(input_ptr4); - float32x4_t zero = vdupq_n_f32(0.f); - float32x4_t acc; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 5 - (padding_w + input_w); - if (padding >= 5) { - *output_ptr0 = 0.f; - } else { - int iw = w - valid_w_end; - float sum0 = input_ptr0[iw] * filter_ptr0[0] + - input_ptr1[iw] * filter_ptr1[0] + - input_ptr2[iw] * filter_ptr2[0] + - input_ptr3[iw] * filter_ptr3[0] + - input_ptr4[iw] * filter_ptr4[0]; - row0 = vextq_f32(row0, zero, 1); - row1 = vextq_f32(row1, zero, 1); - row2 = vextq_f32(row2, zero, 1); - row3 = vextq_f32(row3, zero, 1); - row4 = vextq_f32(row4, zero, 1); - acc = vmulq_f32(row0, _ker[0]); - acc = vmlaq_f32(acc, row1, _ker[1]); - acc = vmlaq_f32(acc, row2, _ker[2]); - acc = vmlaq_f32(acc, row3, _ker[3]); - acc = vmlaq_f32(acc, row4, _ker[4]); - float32x2_t sum = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc)); - sum = vpadd_f32(sum, sum); - sum0 += vget_lane_f32(sum, 0); - *output_ptr0 = sum0; - } - output_ptr0++; - } - } - } - // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { - DepthwiseConv5x5NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, 
padding_h, padding_w, output_w, - output_ptr, _ker, _ker1); - } - } -} - -template <> -void DepthwiseConv5x5S2(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) {} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/math/depthwise_conv5x5.h b/mobile/src/operators/math/depthwise_conv5x5.h deleted file mode 100644 index 11d96b078a..0000000000 --- a/mobile/src/operators/math/depthwise_conv5x5.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -// TODO(hjchen2) need to be implemented -// template -// void DepthwiseConv5x5(const framework::Tensor *input, -// const framework::Tensor *filter, -// const std::vector &strides, -// const std::vector &paddings, -// framework::Tensor *output); - -template -void DepthwiseConv5x5S1(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output); - -template -void DepthwiseConv5x5S2(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output); - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise_conv5x5_int8.cpp b/mobile/src/operators/math/depthwise_conv5x5_int8.cpp deleted file mode 100644 index 1e9482beb4..0000000000 --- a/mobile/src/operators/math/depthwise_conv5x5_int8.cpp +++ /dev/null @@ -1,1041 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined(__ARM_NEON__) && !defined(__aarch64__) - -#include -#include "operators/math/depthwise_conv5x5.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -#ifndef __aarch64__ -inline int32x4_t vpaddq_s32(int32x4_t r0, int32x4_t r1) { - int32x2_t sum0 = vpadd_s32(vget_low_s32(r0), vget_high_s32(r0)); - int32x2_t sum1 = vpadd_s32(vget_low_s32(r1), vget_high_s32(r1)); - return vcombine_s32(sum0, sum1); -} -#endif - -template -inline void Depth5x5NormalRowLoadInput(const int8_t *input, int16x4_t *y) { - int16x8_t x = vmovl_s8(vld1_s8(input)); - y[0] = vget_low_s16(x); - y[4] = vget_high_s16(x); - y[1] = vext_s16(y[0], y[4], 1); - y[2] = vext_s16(y[0], y[4], 2); - y[3] = vext_s16(y[0], y[4], 3); -} - -template <> -inline void Depth5x5NormalRowLoadInput<2>(const int8_t *input, int16x4_t *y) { - int8x8x2_t x = vld2_s8(input); - y[0] = vget_low_s16(vmovl_s8(x.val[0])); - y[1] = vget_low_s16(vmovl_s8(x.val[1])); - y[2] = vext_s16(y[0], y[0], 1); - y[3] = vext_s16(y[1], y[1], 1); - y[4] = vext_s16(y[0], y[0], 2); -} - -#define DEPTHWISE_CONV_NORMAL_BORDER(start, end) \ - for (int w = start; w < end; ++w) { \ - const int w_in_start = -padding_w + w * Stride_w; \ - const int w_in_end = w_in_start + 5; \ - const int w_start = w_in_start > 0 ? w_in_start : 0; \ - const int w_end = w_in_end < input_w ? 
w_in_end : input_w; \ - int32_t value = 0; \ - for (int h_in = h_start; h_in < h_end; ++h_in) { \ - for (int w_in = w_start; w_in < w_end; ++w_in) { \ - value += filter[(h_in - h_in_start) * 5 + (w_in - w_in_start)] * \ - input[h_in * input_w + w_in]; \ - } \ - } \ - output_ptr[w] = value; \ - } - -template -inline void DepthwiseConv5x5NormalRow(const int8_t *input, const int8_t *filter, - const int h_output, const int input_h, - const int input_w, const int padding_h, - const int padding_w, const int output_w, - int32_t *output, int16x4_t *ker, - int16_t *ker1) { - const int h_in_start = -padding_h + h_output * Stride_h; - const int h_in_end = h_in_start + 5; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int h_end = h_in_end < input_h ? h_in_end : input_h; - - int valid_w_start = (padding_w + Stride_w - 1) / Stride_w; - int valid_w_end = output_w - valid_w_start; - int32_t *output_ptr = output + h_output * output_w; - // border left - DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start) - // middle - int output_tiles = (valid_w_end - valid_w_start) >> 2; - int16x4_t _x[5]; - int32x4_t _sum; - // valid w - for (int w = 0; w < output_tiles * 4; w += 4) { - _sum = vdupq_n_s32(0); - int output_offset = valid_w_start + w; - int input_w_offset = output_offset * Stride_w - padding_w; - for (int h_in = h_start; h_in < h_end; ++h_in) { - int index = h_in - h_in_start; - Depth5x5NormalRowLoadInput( - input + h_in * input_w + input_w_offset, _x); - _sum = vmlal_n_s16(_sum, _x[0], ker1[index]); - _sum = vmlal_lane_s16(_sum, _x[1], ker[index], 0); - _sum = vmlal_lane_s16(_sum, _x[2], ker[index], 1); - _sum = vmlal_lane_s16(_sum, _x[3], ker[index], 2); - _sum = vmlal_lane_s16(_sum, _x[4], ker[index], 3); - } - vst1q_s32(output_ptr + output_offset, _sum); - } - // remain valid w - int remain = (valid_w_end - valid_w_start) & 0x3; - if (remain > 0) { - _sum = vdupq_n_s32(0); - int remain_start = valid_w_start + (output_tiles << 2); - int input_w_offset = remain_start * 
Stride_w - padding_w; - int32_t *output_ptr0 = output_ptr + remain_start; - - for (int h_in = h_start; h_in < h_end; ++h_in) { - int index = h_in - h_in_start; - Depth5x5NormalRowLoadInput( - input + h_in * input_w + input_w_offset, _x); - _sum = vmlal_n_s16(_sum, _x[0], ker1[index]); - _sum = vmlal_lane_s16(_sum, _x[1], ker[index], 0); - _sum = vmlal_lane_s16(_sum, _x[2], ker[index], 1); - _sum = vmlal_lane_s16(_sum, _x[3], ker[index], 2); - _sum = vmlal_lane_s16(_sum, _x[4], ker[index], 3); - } - switch (remain) { - case 1: - vst1_lane_s32(output_ptr0, vget_low_s32(_sum), 0); - break; - case 2: - vst1_s32(output_ptr0, vget_low_s32(_sum)); - break; - case 3: - vst1_s32(output_ptr0, vget_low_s32(_sum)); - vst1_lane_s32(output_ptr0 + 2, vget_high_s32(_sum), 0); - break; - } - } - // border right - DEPTHWISE_CONV_NORMAL_BORDER(valid_w_end, output_w) -} - -template <> -void DepthwiseConv5x5S1(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - const int8_t *input_data = input.data(); - const int8_t *filter_data = filter.data(); - int32_t *out_data = output->mutable_data(); - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int padding_h = paddings[0]; - int padding_w = paddings[1]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - int valid_h_start = padding_h; - int valid_h_end = output_h - valid_h_start; - int valid_h = valid_h_end - valid_h_start; - int valid_w_start = padding_w; - int valid_w_end = output_w - valid_w_start; - int valid_w = valid_w_end - valid_w_start; - - #pragma omp parallel for - for (int g = 0; g < input.dims()[1]; ++g) { - const int8_t *input_ptr = input_data + g * image_size; - const int8_t *filter_ptr = filter_data + g * 25; - int32_t *output_ptr = out_data + g * out_image_size; - - const int8_t *filter_ptr0 = filter_ptr; - const int8_t 
*filter_ptr1 = filter_ptr0 + 5; - const int8_t *filter_ptr2 = filter_ptr1 + 5; - const int8_t *filter_ptr3 = filter_ptr2 + 5; - const int8_t *filter_ptr4 = filter_ptr3 + 5; - int16_t kernel[5] = {*filter_ptr0, *filter_ptr1, *filter_ptr2, *filter_ptr3, - *filter_ptr4}; - int16x4_t _k0 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr0 + 1))); - int16x4_t _k1 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr1 + 1))); - int16x4_t _k2 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr2 + 1))); - int16x4_t _k3 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr3 + 1))); - int16x4_t _k4 = vget_low_s16(vmovl_s8(vld1_s8(filter_ptr4 + 1))); - int16x4_t _k5 = vld1_s16(kernel); - int16x4_t _k6 = vld1_s16(kernel + 4); - int16x8_t _ker0 = vcombine_s16(_k0, _k1); - int16x8_t _ker1 = vcombine_s16(_k2, _k3); - int16x8_t _ker2 = vcombine_s16(_k4, _k5); - int16x8_t _ker3 = vcombine_s16(_k6, _k6); - int16x4_t _ker[7] = {_k0, _k1, _k2, _k3, _k4, _k5, _k6}; - - // pad top - for (int h = 0; h < valid_h_start; ++h) { - DepthwiseConv5x5NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker, kernel); - } - - // output 4x4 - int output_w_tiles = valid_w / 8; - int output_w_remain = valid_w - output_w_tiles * 8; - for (int h = valid_h_start; h < valid_h_end - 1; h += 2) { - const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const int8_t *input_ptr1 = input_ptr0 + input_w; - const int8_t *input_ptr2 = input_ptr1 + input_w; - const int8_t *input_ptr3 = input_ptr2 + input_w; - const int8_t *input_ptr4 = input_ptr3 + input_w; - const int8_t *input_ptr5 = input_ptr4 + input_w; - int32_t *output_ptr0 = output_ptr + h * output_w; - int32_t *output_ptr1 = output_ptr0 + output_w; - // pad left - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int16x4_t row3 = 
vget_low_s16(vmovl_s8(vld1_s8(input_ptr3))); - int16x4_t row4 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr4))); - int16x4_t row5 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr5))); - int16x4_t zero = vdup_n_s16(0); - int32x4_t acc0, acc1; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 5) { - output_ptr0[w] = 0; - output_ptr1[w] = 0; - } else { - acc0 = vmull_s16(row0, _ker[0]); - acc0 = vmlal_s16(acc0, row1, _ker[1]); - acc0 = vmlal_s16(acc0, row2, _ker[2]); - acc0 = vmlal_s16(acc0, row3, _ker[3]); - acc0 = vmlal_s16(acc0, row4, _ker[4]); - acc1 = vmull_s16(row1, _ker[0]); - acc1 = vmlal_s16(acc1, row2, _ker[1]); - acc1 = vmlal_s16(acc1, row3, _ker[2]); - acc1 = vmlal_s16(acc1, row4, _ker[3]); - acc1 = vmlal_s16(acc1, row5, _ker[4]); - acc0 = vpaddq_s32(acc0, acc1); - int32x2_t sum = vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0)); - vst1_lane_s32(output_ptr0 + w, sum, 0); - vst1_lane_s32(output_ptr1 + w, sum, 1); - - row0 = vext_s16(zero, row0, 3); - row1 = vext_s16(zero, row1, 3); - row2 = vext_s16(zero, row2, 3); - row3 = vext_s16(zero, row3, 3); - row4 = vext_s16(zero, row4, 3); - row5 = vext_s16(zero, row5, 3); - } - } - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - } - // valid - int loop = output_w_tiles; - int w_remain = output_w_remain; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain4_%= \n" - "mov r0, #8 \n" - "loop_2h8w_%=: \n" - "vld1.s8 {d10-d11}, [%[input_ptr0]], r0 \n" - "vld1.s8 {d12-d13}, [%[input_ptr1]], r0 \n" - "vld1.s8 {d14-d15}, [%[input_ptr2]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmull.s16 q12, d16, %f[ker2][0] \n" - "vmull.s16 q13, d17, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker0][0] \n" - "vmlal.s16 q13, d21, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker0][1] \n" - "vmlal.s16 q13, d21, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker0][2] \n" - 
"vmlal.s16 q13, d21, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker0][3] \n" - "vmlal.s16 q13, d21, %e[ker0][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q12, d16, %f[ker2][1] \n" - "vmlal.s16 q13, d17, %f[ker2][1] \n" - "vmull.s16 q14, d16, %f[ker2][0] \n" - "vmull.s16 q15, d17, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker0][0] \n" - "vmlal.s16 q13, d21, %f[ker0][0] \n" - "vmlal.s16 q14, d20, %e[ker0][0] \n" - "vmlal.s16 q15, d21, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker0][1] \n" - "vmlal.s16 q13, d21, %f[ker0][1] \n" - "vmlal.s16 q14, d20, %e[ker0][1] \n" - "vmlal.s16 q15, d21, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker0][2] \n" - "vmlal.s16 q13, d21, %f[ker0][2] \n" - "vmlal.s16 q14, d20, %e[ker0][2] \n" - "vmlal.s16 q15, d21, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker0][3] \n" - "vmlal.s16 q13, d21, %f[ker0][3] \n" - "vmlal.s16 q14, d20, %e[ker0][3] \n" - "vmlal.s16 q15, d21, %e[ker0][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmovl.s8 q9, d15 \n" - "vmlal.s16 q12, d16, %f[ker2][2] \n" - "vmlal.s16 q13, d17, %f[ker2][2] \n" - "vmlal.s16 q14, d16, %f[ker2][1] \n" - "vmlal.s16 q15, d17, %f[ker2][1] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker1][0] \n" - "vmlal.s16 q13, d21, %e[ker1][0] \n" - "vmlal.s16 q14, d20, %f[ker0][0] \n" - "vmlal.s16 q15, d21, %f[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker1][1] \n" - "vmlal.s16 q13, d21, %e[ker1][1] \n" - "vmlal.s16 q14, d20, %f[ker0][1] \n" - "vmlal.s16 q15, d21, %f[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker1][2] \n" - "vmlal.s16 q13, d21, %e[ker1][2] \n" - "vmlal.s16 q14, d20, %f[ker0][2] \n" - "vmlal.s16 q15, d21, %f[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker1][3] \n" - "vmlal.s16 q13, d21, %e[ker1][3] \n" - "vmlal.s16 
q14, d20, %f[ker0][3] \n" - "vmlal.s16 q15, d21, %f[ker0][3] \n" - - "vld1.s8 {d10-d11}, [%[input_ptr3]], r0 \n" - "vld1.s8 {d12-d13}, [%[input_ptr4]], r0 \n" - "vld1.s8 {d14-d15}, [%[input_ptr5]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmlal.s16 q12, d16, %f[ker2][3] \n" - "vmlal.s16 q13, d17, %f[ker2][3] \n" - "vmlal.s16 q14, d16, %f[ker2][2] \n" - "vmlal.s16 q15, d17, %f[ker2][2] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker1][0] \n" - "vmlal.s16 q13, d21, %f[ker1][0] \n" - "vmlal.s16 q14, d20, %e[ker1][0] \n" - "vmlal.s16 q15, d21, %e[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker1][1] \n" - "vmlal.s16 q13, d21, %f[ker1][1] \n" - "vmlal.s16 q14, d20, %e[ker1][1] \n" - "vmlal.s16 q15, d21, %e[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker1][2] \n" - "vmlal.s16 q13, d21, %f[ker1][2] \n" - "vmlal.s16 q14, d20, %e[ker1][2] \n" - "vmlal.s16 q15, d21, %e[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker1][3] \n" - "vmlal.s16 q13, d21, %f[ker1][3] \n" - "vmlal.s16 q14, d20, %e[ker1][3] \n" - "vmlal.s16 q15, d21, %e[ker1][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q12, d16, %e[ker3][0] \n" - "vmlal.s16 q13, d17, %e[ker3][0] \n" - "vmlal.s16 q14, d16, %f[ker2][3] \n" - "vmlal.s16 q15, d17, %f[ker2][3] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker2][0] \n" - "vmlal.s16 q13, d21, %e[ker2][0] \n" - "vmlal.s16 q14, d20, %f[ker1][0] \n" - "vmlal.s16 q15, d21, %f[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker2][1] \n" - "vmlal.s16 q13, d21, %e[ker2][1] \n" - "vmlal.s16 q14, d20, %f[ker1][1] \n" - "vmlal.s16 q15, d21, %f[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker2][2] \n" - "vmlal.s16 q13, d21, %e[ker2][2] \n" - "vmlal.s16 q14, d20, %f[ker1][2] \n" - "vmlal.s16 q15, d21, %f[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker2][3] 
\n" - "vmlal.s16 q13, d21, %e[ker2][3] \n" - "vmlal.s16 q14, d20, %f[ker1][3] \n" - "vmlal.s16 q15, d21, %f[ker1][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmovl.s8 q9, d15 \n" - "vmlal.s16 q14, d16, %e[ker3][0] \n" - "vmlal.s16 q15, d17, %e[ker3][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q14, d20, %e[ker2][0] \n" - "vmlal.s16 q15, d21, %e[ker2][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q14, d20, %e[ker2][1] \n" - "vmlal.s16 q15, d21, %e[ker2][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q14, d20, %e[ker2][2] \n" - "vmlal.s16 q15, d21, %e[ker2][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q14, d20, %e[ker2][3] \n" - "vmlal.s16 q15, d21, %e[ker2][3] \n" - - // restore output - "vst1.32 {q12-q13}, [%[output_ptr0]]! \n" - "vst1.32 {q14-q15}, [%[output_ptr1]]! \n" - "subs %[loop], #1 \n" - "bne loop_2h8w_%= \n" - - "start_remain4_%=: \n" - "cmp %[remain], #4 \n" - "blt start_remain_%= \n" - "mov r0, #4 \n" - "vld1.s8 {d10}, [%[input_ptr0]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr1]], r0 \n" - "vld1.s8 {d14}, [%[input_ptr2]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmull.s16 q12, d16, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker0][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %f[ker2][1] \n" - "vmull.s16 q14, d16, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker0][0] \n" - "vmlal.s16 q14, d20, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker0][1] \n" - "vmlal.s16 q14, d20, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker0][2] \n" - "vmlal.s16 q14, d20, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker0][3] \n" - "vmlal.s16 q14, d20, %e[ker0][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmlal.s16 
q12, d16, %f[ker2][2] \n" - "vmlal.s16 q14, d16, %f[ker2][1] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker1][0] \n" - "vmlal.s16 q14, d20, %f[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker1][1] \n" - "vmlal.s16 q14, d20, %f[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker1][2] \n" - "vmlal.s16 q14, d20, %f[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker1][3] \n" - "vmlal.s16 q14, d20, %f[ker0][3] \n" - - "vld1.s8 {d10}, [%[input_ptr3]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr4]], r0 \n" - "vld1.s8 {d14}, [%[input_ptr5]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmlal.s16 q12, d16, %f[ker2][3] \n" - "vmlal.s16 q14, d16, %f[ker2][2] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker1][0] \n" - "vmlal.s16 q14, d20, %e[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker1][1] \n" - "vmlal.s16 q14, d20, %e[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker1][2] \n" - "vmlal.s16 q14, d20, %e[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker1][3] \n" - "vmlal.s16 q14, d20, %e[ker1][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %e[ker3][0] \n" - "vmlal.s16 q14, d16, %f[ker2][3] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker2][0] \n" - "vmlal.s16 q14, d20, %f[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker2][1] \n" - "vmlal.s16 q14, d20, %f[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker2][2] \n" - "vmlal.s16 q14, d20, %f[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker2][3] \n" - "vmlal.s16 q14, d20, %f[ker1][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmlal.s16 q14, d16, %e[ker3][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q14, d20, %e[ker2][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q14, d20, %e[ker2][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q14, d20, %e[ker2][2] \n" - 
"vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q14, d20, %e[ker2][3] \n" - - // restore output - "vst1.32 {d24-d25}, [%[output_ptr0]]! \n" - "vst1.32 {d28-d29}, [%[output_ptr1]]! \n" - "sub %[remain], #4 \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - "mov r0, %[remain] \n" - "vld1.s8 {d10}, [%[input_ptr0]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr1]], r0 \n" - "vld1.s8 {d14}, [%[input_ptr2]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmull.s16 q12, d16, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker0][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %f[ker2][1] \n" - "vmull.s16 q14, d16, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker0][0] \n" - "vmlal.s16 q14, d20, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker0][1] \n" - "vmlal.s16 q14, d20, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker0][2] \n" - "vmlal.s16 q14, d20, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker0][3] \n" - "vmlal.s16 q14, d20, %e[ker0][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmlal.s16 q12, d16, %f[ker2][2] \n" - "vmlal.s16 q14, d16, %f[ker2][1] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker1][0] \n" - "vmlal.s16 q14, d20, %f[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker1][1] \n" - "vmlal.s16 q14, d20, %f[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker1][2] \n" - "vmlal.s16 q14, d20, %f[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker1][3] \n" - "vmlal.s16 q14, d20, %f[ker0][3] \n" - - "vld1.s8 {d10}, [%[input_ptr3]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr4]], r0 \n" - "vld1.s8 {d14}, [%[input_ptr5]], r0 \n" - "vmovl.s8 
q8, d10 \n" - "vmlal.s16 q12, d16, %f[ker2][3] \n" - "vmlal.s16 q14, d16, %f[ker2][2] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker1][0] \n" - "vmlal.s16 q14, d20, %e[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker1][1] \n" - "vmlal.s16 q14, d20, %e[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker1][2] \n" - "vmlal.s16 q14, d20, %e[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker1][3] \n" - "vmlal.s16 q14, d20, %e[ker1][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %e[ker3][0] \n" - "vmlal.s16 q14, d16, %f[ker2][3] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker2][0] \n" - "vmlal.s16 q14, d20, %f[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker2][1] \n" - "vmlal.s16 q14, d20, %f[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker2][2] \n" - "vmlal.s16 q14, d20, %f[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker2][3] \n" - "vmlal.s16 q14, d20, %f[ker1][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmlal.s16 q14, d16, %e[ker3][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q14, d20, %e[ker2][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q14, d20, %e[ker2][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q14, d20, %e[ker2][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q14, d20, %e[ker2][3] \n" - - "cmp %[remain], #2 \n" - "blt store_2h1w_%= \n" - "vst1.32 {d24}, [%[output_ptr0]]! \n" - "vst1.32 {d28}, [%[output_ptr1]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d25[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d29[0]}, [%[output_ptr1]]! \n" - "b end_%= \n" - - "store_2h1w_%=: \n" - "vst1.32 {d24[0]}, [%[output_ptr0]]! \n" - "vst1.32 {d28[0]}, [%[output_ptr1]]! 
\n" - "end_%=: \n" - : [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), - [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), - [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5), - [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), - [loop] "+r"(loop), [remain] "+r"(w_remain) - : [ker0] "w"(_ker0), [ker1] "w"(_ker1), [ker2] "w"(_ker2), - [ker3] "w"(_ker3) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15", "r0"); - // pad right - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3))); - int16x4_t row4 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr4))); - int16x4_t row5 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr5))); - int16x4_t zero = vdup_n_s16(0); - int32x4_t acc0, acc1; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 5 - (padding_w + input_w); - if (padding >= 5) { - *output_ptr0 = 0; - *output_ptr1 = 0; - } else { - int iw = w - valid_w_end; - int32_t sum0 = input_ptr0[iw] * filter_ptr0[0] + - input_ptr1[iw] * filter_ptr1[0] + - input_ptr2[iw] * filter_ptr2[0] + - input_ptr3[iw] * filter_ptr3[0] + - input_ptr4[iw] * filter_ptr4[0]; - int32_t sum1 = input_ptr1[iw] * filter_ptr0[0] + - input_ptr2[iw] * filter_ptr1[0] + - input_ptr3[iw] * filter_ptr2[0] + - input_ptr4[iw] * filter_ptr3[0] + - input_ptr5[iw] * filter_ptr4[0]; - row0 = vext_s16(row0, zero, 1); - row1 = vext_s16(row1, zero, 1); - row2 = vext_s16(row2, zero, 1); - row3 = vext_s16(row3, zero, 1); - row4 = vext_s16(row4, zero, 1); - row5 = vext_s16(row5, zero, 1); - acc0 = vmull_s16(row0, _ker[0]); - acc0 = vmlal_s16(acc0, row1, _ker[1]); - acc0 = vmlal_s16(acc0, row2, _ker[2]); - acc0 = vmlal_s16(acc0, row3, _ker[3]); - acc0 = vmlal_s16(acc0, row4, _ker[4]); - acc1 = 
vmull_s16(row1, _ker[0]); - acc1 = vmlal_s16(acc1, row2, _ker[1]); - acc1 = vmlal_s16(acc1, row3, _ker[2]); - acc1 = vmlal_s16(acc1, row4, _ker[3]); - acc1 = vmlal_s16(acc1, row5, _ker[4]); - acc0 = vpaddq_s32(acc0, acc1); - int32x2_t sum = vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0)); - sum0 += vget_lane_s32(sum, 0); - sum1 += vget_lane_s32(sum, 1); - *output_ptr0 = sum0; - *output_ptr1 = sum1; - } - output_ptr0++; - output_ptr1++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xfffffffe); - if (start_h < valid_h_end) { - const int8_t *input_ptr0 = input_ptr + (start_h - padding_h) * input_w; - const int8_t *input_ptr1 = input_ptr0 + input_w; - const int8_t *input_ptr2 = input_ptr1 + input_w; - const int8_t *input_ptr3 = input_ptr2 + input_w; - const int8_t *input_ptr4 = input_ptr3 + input_w; - int32_t *output_ptr0 = output_ptr + start_h * output_w; - // pad left - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3))); - int16x4_t row4 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr4))); - int16x4_t zero = vdup_n_s16(0); - int32x4_t acc; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 5) { - output_ptr0[w] = 0; - } else { - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, row2, _ker[2]); - acc = vmlal_s16(acc, row3, _ker[3]); - acc = vmlal_s16(acc, row4, _ker[4]); - int32x2_t sum = vpadd_s32(vget_low_s32(acc), vget_high_s32(acc)); - sum = vpadd_s32(sum, sum); - vst1_lane_s32(output_ptr0 + w, sum, 0); - - row0 = vext_s16(zero, row0, 3); - row1 = vext_s16(zero, row1, 3); - row2 = vext_s16(zero, row2, 3); - row3 = vext_s16(zero, row3, 3); - row4 = vext_s16(zero, row4, 3); - } - } - output_ptr0 += valid_w_start; - } - // valid - int 
loop = output_w_tiles; - int w_remain = output_w_remain; - asm volatile( - "cmp %[loop], #0 \n" - "ble start_remain4_%= \n" - "mov r0, #8 \n" - "loop_1h8w_%=: \n" - "vld1.s8 {d10-d11}, [%[input_ptr0]], r0 \n" - "vld1.s8 {d12-d13}, [%[input_ptr1]], r0 \n" - "vld1.s8 {d14-d15}, [%[input_ptr2]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmull.s16 q12, d16, %f[ker2][0] \n" - "vmull.s16 q13, d17, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker0][0] \n" - "vmlal.s16 q13, d21, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker0][1] \n" - "vmlal.s16 q13, d21, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker0][2] \n" - "vmlal.s16 q13, d21, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker0][3] \n" - "vmlal.s16 q13, d21, %e[ker0][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q12, d16, %f[ker2][1] \n" - "vmlal.s16 q13, d17, %f[ker2][1] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker0][0] \n" - "vmlal.s16 q13, d21, %f[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker0][1] \n" - "vmlal.s16 q13, d21, %f[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker0][2] \n" - "vmlal.s16 q13, d21, %f[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker0][3] \n" - "vmlal.s16 q13, d21, %f[ker0][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmovl.s8 q9, d15 \n" - "vmlal.s16 q12, d16, %f[ker2][2] \n" - "vmlal.s16 q13, d17, %f[ker2][2] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker1][0] \n" - "vmlal.s16 q13, d21, %e[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker1][1] \n" - "vmlal.s16 q13, d21, %e[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker1][2] \n" - "vmlal.s16 q13, d21, %e[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker1][3] \n" - "vmlal.s16 q13, d21, %e[ker1][3] 
\n" - - "vld1.s8 {d10-d11}, [%[input_ptr3]], r0 \n" - "vld1.s8 {d12-d13}, [%[input_ptr4]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmovl.s8 q9, d11 \n" - "vmlal.s16 q12, d16, %f[ker2][3] \n" - "vmlal.s16 q13, d17, %f[ker2][3] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker1][0] \n" - "vmlal.s16 q13, d21, %f[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker1][1] \n" - "vmlal.s16 q13, d21, %f[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker1][2] \n" - "vmlal.s16 q13, d21, %f[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker1][3] \n" - "vmlal.s16 q13, d21, %f[ker1][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmovl.s8 q9, d13 \n" - "vmlal.s16 q12, d16, %e[ker3][0] \n" - "vmlal.s16 q13, d17, %e[ker3][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker2][0] \n" - "vmlal.s16 q13, d21, %e[ker2][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker2][1] \n" - "vmlal.s16 q13, d21, %e[ker2][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker2][2] \n" - "vmlal.s16 q13, d21, %e[ker2][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker2][3] \n" - "vmlal.s16 q13, d21, %e[ker2][3] \n" - - // restore output - "vst1.32 {q12-q13}, [%[output_ptr0]]! 
\n" - "subs %[loop], #1 \n" - "bne loop_1h8w_%= \n" - - "start_remain4_%=: \n" - "cmp %[remain], #4 \n" - "blt start_remain_%= \n" - "mov r0, #4 \n" - "vld1.s8 {d10}, [%[input_ptr0]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr1]], r0 \n" - "vld1.s8 {d14}, [%[input_ptr2]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmull.s16 q12, d16, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker0][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %f[ker2][1] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker0][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmlal.s16 q12, d16, %f[ker2][2] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker1][3] \n" - - "vld1.s8 {d10}, [%[input_ptr3]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr4]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmlal.s16 q12, d16, %f[ker2][3] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker1][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %e[ker3][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker2][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker2][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - 
"vmlal.s16 q12, d20, %e[ker2][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker2][3] \n" - - // restore output - "vst1.32 {d24-d25}, [%[output_ptr0]]! \n" - "sub %[remain], #4 \n" - - "start_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - "mov r0, %[remain] \n" - "vld1.s8 {d10}, [%[input_ptr0]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr1]], r0 \n" - "vld1.s8 {d14}, [%[input_ptr2]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmull.s16 q12, d16, %f[ker2][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker0][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %f[ker2][1] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker0][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker0][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker0][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker0][3] \n" - - "vmovl.s8 q8, d14 \n" - "vmlal.s16 q12, d16, %f[ker2][2] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %e[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker1][3] \n" - - "vld1.s8 {d10}, [%[input_ptr3]], r0 \n" - "vld1.s8 {d12}, [%[input_ptr4]], r0 \n" - "vmovl.s8 q8, d10 \n" - "vmlal.s16 q12, d16, %f[ker2][3] \n" - "vext.s16 q10, q8, q9, #1 \n" - "vmlal.s16 q12, d20, %f[ker1][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %f[ker1][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %f[ker1][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %f[ker1][3] \n" - - "vmovl.s8 q8, d12 \n" - "vmlal.s16 q12, d16, %e[ker3][0] \n" - "vext.s16 q10, q8, q9, #1 \n" - 
"vmlal.s16 q12, d20, %e[ker2][0] \n" - "vext.s16 q10, q8, q9, #2 \n" - "vmlal.s16 q12, d20, %e[ker2][1] \n" - "vext.s16 q10, q8, q9, #3 \n" - "vmlal.s16 q12, d20, %e[ker2][2] \n" - "vext.s16 q10, q8, q9, #4 \n" - "vmlal.s16 q12, d20, %e[ker2][3] \n" - - "cmp %[remain], #2 \n" - "blt store_1h1w_%= \n" - "vst1.32 {d24}, [%[output_ptr0]]! \n" - "cmp %[remain], #3 \n" - "blt end_%= \n" - "vst1.32 {d25[0]}, [%[output_ptr0]]! \n" - "b end_%= \n" - - "store_1h1w_%=: \n" - "vst1.32 {d24[0]}, [%[output_ptr0]]! \n" - "end_%=: \n" - : [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), - [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), - [input_ptr4] "+r"(input_ptr4), [output_ptr0] "+r"(output_ptr0), - [loop] "+r"(loop), [remain] "+r"(w_remain) - : [ker0] "w"(_ker0), [ker1] "w"(_ker1), [ker2] "w"(_ker2), - [ker3] "w"(_ker3) - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15", "r0"); - // pad right - if (padding_w) { - int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); - int16x4_t row1 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr1))); - int16x4_t row2 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr2))); - int16x4_t row3 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr3))); - int16x4_t row4 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr4))); - int16x4_t zero = vdup_n_s16(0); - int32x4_t acc; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 5 - (padding_w + input_w); - if (padding >= 5) { - *output_ptr0 = 0; - } else { - int iw = w - valid_w_end; - int32_t sum0 = input_ptr0[iw] * filter_ptr0[0] + - input_ptr1[iw] * filter_ptr1[0] + - input_ptr2[iw] * filter_ptr2[0] + - input_ptr3[iw] * filter_ptr3[0] + - input_ptr4[iw] * filter_ptr4[0]; - row0 = vext_s16(row0, zero, 1); - row1 = vext_s16(row1, zero, 1); - row2 = vext_s16(row2, zero, 1); - row3 = vext_s16(row3, zero, 1); - row4 = vext_s16(row4, zero, 1); - acc = vmull_s16(row0, _ker[0]); - acc = vmlal_s16(acc, row1, _ker[1]); - acc = vmlal_s16(acc, 
row2, _ker[2]); - acc = vmlal_s16(acc, row3, _ker[3]); - acc = vmlal_s16(acc, row4, _ker[4]); - int32x2_t sum = vpadd_s32(vget_low_s32(acc), vget_high_s32(acc)); - sum = vpadd_s32(sum, sum); - sum0 += vget_lane_s32(sum, 0); - *output_ptr0 = sum0; - } - output_ptr0++; - } - } - } - // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { - DepthwiseConv5x5NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, - input_w, padding_h, padding_w, output_w, - output_ptr, _ker, kernel); - } - } -} - -template <> -void DepthwiseConv5x5S2(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) {} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/math/element_wise.h b/mobile/src/operators/math/element_wise.h deleted file mode 100644 index f81931930f..0000000000 --- a/mobile/src/operators/math/element_wise.h +++ /dev/null @@ -1,396 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/tensor.h" -#include "operators/math/activation.h" -#ifdef __ARM_NEON -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -void AddChannelWise(const framework::Tensor *input, - const framework::Tensor *bias, framework::Tensor *output) { - const float *input_ptr = input->data(); - const float *bias_ptr = bias->data(); - float *output_ptr = output->mutable_data(); - // maybe check shape - int batch_size = input->dims()[0]; - int channels = input->dims()[1]; - int spatial_size = input->dims()[2] * input->dims()[3]; - - for (int batch = 0; batch < batch_size; ++batch) { - for (int channel = 0; channel < channels; ++channel) { - size_t offset = (batch * channels + channel) * spatial_size; - const float *x = input_ptr + offset; - float *y = output_ptr + offset; - float beta = bias_ptr[channel]; - int j = 0; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - float32x4_t __bias = vdupq_n_f32(beta); - for (; j < spatial_size - 15; j += 16, x += 16, y += 16) { - float32x4_t in0 = vld1q_f32(x); - float32x4_t in1 = vld1q_f32(x + 4); - float32x4_t in2 = vld1q_f32(x + 8); - float32x4_t in3 = vld1q_f32(x + 12); - in0 = vaddq_f32(__bias, in0); - in1 = vaddq_f32(__bias, in1); - in2 = vaddq_f32(__bias, in2); - in3 = vaddq_f32(__bias, in3); - in0 = math::vActiveq_f32(in0); - in1 = math::vActiveq_f32(in1); - in2 = math::vActiveq_f32(in2); - in3 = math::vActiveq_f32(in3); - vst1q_f32(y, in0); - vst1q_f32(y + 4, in1); - vst1q_f32(y + 8, in2); - vst1q_f32(y + 12, in3); - } - for (; j < spatial_size - 3; j += 4, x += 4, y += 4) { - float32x4_t in0 = vld1q_f32(x); - in0 = vaddq_f32(__bias, in0); - in0 = math::vActiveq_f32(in0); - vst1q_f32(y, in0); - } -#endif - for (; j < spatial_size; ++j, ++x, ++y) { - *y = math::Active((*x) + beta); - } - } - } -} - -template -void ScaleAddChannelWise(const framework::Tensor *input, - const framework::Tensor *scale, - const framework::Tensor *bias, - 
framework::Tensor *output) { - const float *input_ptr = input->data(); - const float *scale_ptr = scale->data(); - const float *bias_ptr = bias->data(); - float *output_ptr = output->mutable_data(); - // maybe check shape - int batch_size = input->dims()[0]; - int channels = input->dims()[1]; - int spatial_size = input->dims()[2] * input->dims()[3]; - - for (int batch = 0; batch < batch_size; ++batch) { - for (int channel = 0; channel < channels; ++channel) { - size_t offset = (batch * channels + channel) * spatial_size; - const float *x = input_ptr + offset; - float *y = output_ptr + offset; - float alpha = scale_ptr[channel]; - float beta = bias_ptr[channel]; - int j = 0; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - float32x4_t __scale = vdupq_n_f32(alpha); - float32x4_t __bias = vdupq_n_f32(beta); - for (; j < spatial_size - 15; j += 16, x += 16, y += 16) { - float32x4_t in0 = vld1q_f32(x); - float32x4_t in1 = vld1q_f32(x + 4); - float32x4_t in2 = vld1q_f32(x + 8); - float32x4_t in3 = vld1q_f32(x + 12); - in0 = vmlaq_f32(__bias, __scale, in0); - in1 = vmlaq_f32(__bias, __scale, in1); - in2 = vmlaq_f32(__bias, __scale, in2); - in3 = vmlaq_f32(__bias, __scale, in3); - in0 = math::vActiveq_f32(in0); - in1 = math::vActiveq_f32(in1); - in2 = math::vActiveq_f32(in2); - in3 = math::vActiveq_f32(in3); - vst1q_f32(y, in0); - vst1q_f32(y + 4, in1); - vst1q_f32(y + 8, in2); - vst1q_f32(y + 12, in3); - } - for (; j < spatial_size - 3; j += 4, x += 4, y += 4) { - float32x4_t in0 = vld1q_f32(x); - in0 = vmlaq_f32(__bias, __scale, in0); - in0 = math::vActiveq_f32(in0); - vst1q_f32(y, in0); - } -#endif - for (; j < spatial_size; ++j, ++x, ++y) { - *y = math::Active(alpha * (*x) + beta); - } - } - } -} - -template -void ScaleAddChannelWise(const framework::Tensor *input, - const framework::Tensor *scale, - const framework::Tensor *bias, - const framework::Tensor *tensorwise_bias, - framework::Tensor *output) { - const float *input_ptr = input->data(); - const float 
*scale_ptr = scale->data(); - const float *bias_ptr = bias->data(); - const float *tensorwise_bias_ptr = tensorwise_bias->data(); - float *output_ptr = output->mutable_data(); - // maybe check shape - int batch_size = input->dims()[0]; - int channels = input->dims()[1]; - int spatial_size = input->dims()[2] * input->dims()[3]; - - for (int batch = 0; batch < batch_size; ++batch) { - for (int channel = 0; channel < channels; ++channel) { - size_t offset = (batch * channels + channel) * spatial_size; - const float *x = input_ptr + offset; - const float *b = tensorwise_bias_ptr + offset; - float *y = output_ptr + offset; - float alpha = scale_ptr[channel]; - float beta = bias_ptr[channel]; - int j = 0; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - float32x4_t __scale = vdupq_n_f32(alpha); - float32x4_t __bias = vdupq_n_f32(beta); - for (; j < spatial_size - 15; j += 16, x += 16, b += 16, y += 16) { - float32x4_t in0 = vld1q_f32(x); - float32x4_t in1 = vld1q_f32(x + 4); - float32x4_t in2 = vld1q_f32(x + 8); - float32x4_t in3 = vld1q_f32(x + 12); - float32x4_t b0 = vld1q_f32(b); - float32x4_t b1 = vld1q_f32(b + 4); - float32x4_t b2 = vld1q_f32(b + 8); - float32x4_t b3 = vld1q_f32(b + 12); - in0 = vmlaq_f32(__bias, __scale, in0); - in1 = vmlaq_f32(__bias, __scale, in1); - in2 = vmlaq_f32(__bias, __scale, in2); - in3 = vmlaq_f32(__bias, __scale, in3); - in0 = vaddq_f32(in0, b0); - in1 = vaddq_f32(in1, b1); - in2 = vaddq_f32(in2, b2); - in3 = vaddq_f32(in3, b3); - in0 = math::vActiveq_f32(in0); - in1 = math::vActiveq_f32(in1); - in2 = math::vActiveq_f32(in2); - in3 = math::vActiveq_f32(in3); - vst1q_f32(y, in0); - vst1q_f32(y + 4, in1); - vst1q_f32(y + 8, in2); - vst1q_f32(y + 12, in3); - } - for (; j < spatial_size - 3; j += 4, x += 4, b += 4, y += 4) { - float32x4_t in0 = vld1q_f32(x); - float32x4_t b0 = vld1q_f32(b); - in0 = vmlaq_f32(__bias, __scale, in0); - in0 = vaddq_f32(in0, b0); - in0 = math::vActiveq_f32(in0); - vst1q_f32(y, in0); - } -#endif - for (; j < 
spatial_size; ++j, ++x, ++b, ++y) { - *y = math::Active(alpha * (*x) + beta + (*b)); - } - } - } -} - -template -void AddElememtWise(const framework::Tensor *input, - const framework::Tensor *bias, const int axis, - framework::Tensor *output) { - const auto &x_dims = input->dims(); - const auto &y_dims = bias->dims(); - const float *input_data = input->data(); - const float *bias_data = bias->data(); - float *output_data = output->mutable_data(); - - if (x_dims == y_dims) { - size_t channels = 1; - size_t elementwise_num = 1; - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } -#pragma omp parallel for - for (int j = 0; j < channels; ++j) { - size_t offset = (0 * channels + j) * elementwise_num; - const float *input = input_data + offset; - const float bias = bias_data[j]; - float *output = output_data + offset; -#if 0 - int loop = elementwise_num >> 0x4; - int remain = elementwise_num & 0xF; - float32x4_t rb = vdupq_n_f32(bias); - for (int k = 0; k < loop; ++k) { - float32x4_t r0 = vld1q_f32(input); - float32x4_t r1 = vld1q_f32(input + 4); - float32x4_t r2 = vld1q_f32(input + 8); - float32x4_t r3 = vld1q_f32(input + 12); - r0 = vaddq_f32(r0, rb); - r1 = vaddq_f32(r1, rb); - r2 = vaddq_f32(r2, rb); - r3 = vaddq_f32(r3, rb); - r0 = math::vActiveq_f32(r0); - r1 = math::vActiveq_f32(r1); - r2 = math::vActiveq_f32(r2); - r3 = math::vActiveq_f32(r3); - vst1q_f32(output, r0); - vst1q_f32(output + 4, r1); - vst1q_f32(output + 8, r2); - vst1q_f32(output + 12, r3); - input += 16; - output += 16; - } - if (remain >= 8) { - float32x4_t r0 = vld1q_f32(input); - float32x4_t r1 = vld1q_f32(input + 4); - r0 = vaddq_f32(r0, rb); - r1 = vaddq_f32(r1, rb); - r0 = math::vActiveq_f32(r0); - r1 = math::vActiveq_f32(r1); - vst1q_f32(output, r0); - vst1q_f32(output + 4, r1); - input += 8; - output += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t r0 = vld1q_f32(input); - r0 = vaddq_f32(r0, rb); - r0 = math::vActiveq_f32(r0); - vst1q_f32(output, r0); - input 
+= 4; - output += 4; - remain -= 4; - } - if (remain > 0) { - float32x4_t r0 = vld1q_f32(input); - r0 = vaddq_f32(r0, rb); - r0 = math::vActiveq_f32(r0); - switch (remain) { - case 1: - vst1q_lane_f32(output, r0, 0); - break; - case 2: - vst1_f32(output, vget_low_f32(r0)); - break; - case 3: - vst1_f32(output, vget_low_f32(r0)); - vst1q_lane_f32(output, r0, 2); - break; - } - } -#else - for (int k = 0; k < elementwise_num; ++k) { - output[k] = math::Active(input[k] + bias); - } -#endif // __ARM_NEON__ - } - - } else { - // axis = -1 represent the last dimensions. - int dim = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - size_t batch = 1; - size_t channels = 1; - size_t elementwise_num = 1; - for (int i = 0; i < dim; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + dim; i < x_dims.size(); ++i) { - elementwise_num *= x_dims[i]; - } - -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - size_t offset = (i * channels + j) * elementwise_num; - const float *input = input_data + offset; - const float bias = bias_data[j]; - float *output = output_data + offset; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - int loop = elementwise_num >> 0x4; - int remain = elementwise_num & 0xF; - float32x4_t rb = vdupq_n_f32(bias); - for (int k = 0; k < loop; ++k) { - float32x4_t r0 = vld1q_f32(input); - float32x4_t r1 = vld1q_f32(input + 4); - float32x4_t r2 = vld1q_f32(input + 8); - float32x4_t r3 = vld1q_f32(input + 12); - r0 = vaddq_f32(r0, rb); - r1 = vaddq_f32(r1, rb); - r2 = vaddq_f32(r2, rb); - r3 = vaddq_f32(r3, rb); - r0 = math::vActiveq_f32(r0); - r1 = math::vActiveq_f32(r1); - r2 = math::vActiveq_f32(r2); - r3 = math::vActiveq_f32(r3); - vst1q_f32(output, r0); - vst1q_f32(output + 4, r1); - vst1q_f32(output + 8, r2); - vst1q_f32(output + 12, r3); - input += 16; - output += 16; - } - if (remain >= 8) { - float32x4_t 
r0 = vld1q_f32(input); - float32x4_t r1 = vld1q_f32(input + 4); - r0 = vaddq_f32(r0, rb); - r1 = vaddq_f32(r1, rb); - r0 = math::vActiveq_f32(r0); - r1 = math::vActiveq_f32(r1); - vst1q_f32(output, r0); - vst1q_f32(output + 4, r1); - input += 8; - output += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t r0 = vld1q_f32(input); - r0 = vaddq_f32(r0, rb); - r0 = math::vActiveq_f32(r0); - vst1q_f32(output, r0); - input += 4; - output += 4; - remain -= 4; - } - if (remain > 0) { - float32x4_t r0 = vld1q_f32(input); - r0 = vaddq_f32(r0, rb); - r0 = math::vActiveq_f32(r0); - switch (remain) { - case 1: - vst1q_lane_f32(output, r0, 0); - break; - case 2: - vst1_f32(output, vget_low_f32(r0)); - break; - case 3: - vst1_f32(output, vget_low_f32(r0)); - vst1q_lane_f32(output, r0, 2); - break; - } - } -#else - for (int k = 0; k < elementwise_num; ++k) { - output[k] = math::Active(input[k] + bias); - } -#endif // __ARM_NEON__ - } - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/elementwise_op_function.h b/mobile/src/operators/math/elementwise_op_function.h deleted file mode 100644 index 95fd037988..0000000000 --- a/mobile/src/operators/math/elementwise_op_function.h +++ /dev/null @@ -1,178 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "transform.h" - -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) - -namespace paddle_mobile { -namespace operators { - -/* - * Out = X ⊙ Y - * If Y's shape does not match X' shape, they will be reshaped. - * For example: - * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - * pre=2, n=3*4, post=5 - * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) - * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) - * pre=2*3, n=4*5, post=1 - * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) - */ -inline void get_mid_dims(const framework::DDim &x_dims, - const framework::DDim &y_dims, const int axis, - int *pre, int *n, int *post) { - *pre = 1; - *n = 1; - *post = 1; - // compute pre - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - - for (int i = 0; i < y_dims.size(); ++i) { - assert(x_dims[i + axis] == y_dims[i]); - /// "Broadcast dimension mismatch."); - (*n) *= y_dims[i]; - } - - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } -} - -/// remove dims tail 1. (4,20,1,1) -> (4,20) -inline void trim_trailing_singular_dims(framework::DDim *dims) { - // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims->size(); - for (; actual_dims_size != 0; --actual_dims_size) { - if ((*dims)[actual_dims_size - 1] != 1) break; - } - if (actual_dims_size != dims->size()) { - auto actual_dims = framework::vectorize(*dims); - actual_dims.resize(actual_dims_size); - *dims = framework::make_ddim(actual_dims); - } -} - -/// (4,20,2)+(20,): (20,) just as (20,1), when move 2 strides in last -/// dimension -/// in (4,20,2) is 2 , -/// (20,1) move 1 stride , to fill(add) 2 element with the same number. 
-template -class MidWiseTransformIterator { - public: - MidWiseTransformIterator(const T *ptr, int n, int post) - : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} - - MidWiseTransformIterator &operator++() { - if (post_ != 1) { - ++j_; - if (UNLIKELY(j_ == post_)) { - ++i_; - j_ = 0; - if (UNLIKELY(i_ == n_)) { - i_ = 0; - } - } - return *this; - } else { - ++i_; - if (UNLIKELY(i_ == n_)) { - i_ = 0; - } - return *this; - } - } - - bool operator==(const MidWiseTransformIterator &rhs) const { - return (ptr_ + i_) == &(*rhs); - } - - bool operator!=(const MidWiseTransformIterator &rhs) const { - return (ptr_ + i_) != &(*rhs); - } - - const T &operator*() { return ptr_[i_]; } - - private: - const T *ptr_; - int64_t i_; - int64_t j_; - int64_t n_; - int64_t post_; -}; - -template -class TransformFunctor { - public: - TransformFunctor(const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z, Functor func) - : x_(x->data()), - y_(y->data()), - z_(z->mutable_data()), - nx_(x->numel()), - func_(func) {} - - inline void Run() const { - math::Transform trans; - // 同时执行func(x_, y_)传入z_。 - trans(x_, x_ + nx_, y_, z_, func_); - } - - inline void RunMidWise(int n, int pre, int post) const { - math::Transform trans; - trans(x_, x_ + nx_, MidWiseTransformIterator(y_, n, post), z_, func_); - } - - private: - const T *x_; - const T *y_; - OutType *z_; - int64_t nx_; - Functor func_; -}; - -template -void ElementwiseComputeEx(const framework::Tensor *x, - const framework::Tensor *y, int axis, Functor func, - framework::Tensor *z) { - TransformFunctor functor(x, y, z, func); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - PADDLE_MOBILE_ENFORCE(x_dims.size() >= y_dims.size(), - "Rank of first input must >= rank of second input."); - - if (x_dims == y_dims) { - functor.Run(); - return; - } - - /// axis = -1 represent the last dimensions. - axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); - PADDLE_MOBILE_ENFORCE(axis >= 0 && axis < x_dims.size(), - "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); - axis = (y_dims.size() == 0) ? x_dims.size() : axis; - - int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); - - functor.RunMidWise(n, pre, post); -} - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gemm.cpp b/mobile/src/operators/math/gemm.cpp deleted file mode 100644 index 1fa78d1616..0000000000 --- a/mobile/src/operators/math/gemm.cpp +++ /dev/null @@ -1,3807 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/math/gemm.h" -#include -#include "common/log.h" -#include "memory/t_malloc.h" -#if __ARM_NEON -#include -#endif -#ifdef _OPENMP -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -#if __ARM_NEON -inline float32x4_t vandq_f32(float32x4_t x, uint32x4_t mask) { - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); -} -#endif - -void Gemm::PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, - float *buffer, const bool parallel) { - uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5}; - int remain_k = k & 0x3; - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k)); - - #pragma omp parallel for if (parallel) - for (int i = 0; i < m - 5; i += 6) { - const float *a0 = A + i * lda; - const float *a1 = A + (i + 1) * lda; - const float *a2 = A + (i + 2) * lda; - const float *a3 = A + (i + 3) * lda; - const float *a4 = A + (i + 4) * lda; - const float *a5 = A + (i + 5) * lda; - float *out_ptr = buffer + i * k; - - int loops = k >> 2; - if (loops > 0) { -#if __aarch64__ - for (int l = 0; l < loops; ++l) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = - vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - _d3 = - vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); - - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - vst1q_f32(out_ptr + 12, _d2); - 
vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - vst1q_f32(out_ptr + 18, _d3); - vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1])); - - a0 += 4; - a1 += 4; - a2 += 4; - a3 += 4; - a4 += 4; - a5 += 4; - out_ptr += 24; - } -#else - asm volatile( - "loop_4k_%=: \n" - "vld1.32 {d0-d1}, [%[a0]]! \n" - "vld1.32 {d2-d3}, [%[a1]]! \n" - "vld1.32 {d4-d5}, [%[a2]]! \n" - "vld1.32 {d6-d7}, [%[a3]]! \n" - "vld1.32 {d8-d9}, [%[a4]]! \n" - "vld1.32 {d10-d11}, [%[a5]]! \n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vtrn.32 q4, q5 \n" - "vswp.32 d1, d4 \n" - "vswp.32 d3, d6 \n" - - "vst1.32 {q0}, [%[out]]! \n" - "vst1.32 {d8}, [%[out]]! \n" - "vst1.32 {q1}, [%[out]]! \n" - "vst1.32 {d10}, [%[out]]! \n" - "vst1.32 {q2}, [%[out]]! \n" - "vst1.32 {d9}, [%[out]]! \n" - "vst1.32 {q3}, [%[out]]! \n" - "vst1.32 {d11}, [%[out]]! \n" - - "subs %[loops], #1 \n" - "bne loop_4k_%= \n" - : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), - [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -#endif - } - - if (remain_k > 0) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - _d0 = vandq_f32(_d0, vmask1); - _d1 = vandq_f32(_d1, vmask1); - _d2 = vandq_f32(_d2, vmask1); - _d3 = vandq_f32(_d3, vmask1); - _d4 = vandq_f32(_d4, vmask1); - _d5 = vandq_f32(_d5, vmask1); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - - switch (remain_k) { - case 3: - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - case 2: 
- vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - case 1: - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - default: - break; - } - } - } - - int remain_m = m % 6; - if (remain_m) { - int remain_m_start = m - remain_m; - const float *a0 = A + remain_m_start * lda; - const float *a1 = a0 + lda; - const float *a2 = a0 + 2 * lda; - const float *a3 = a0 + 3 * lda; - const float *a4 = a0 + 4 * lda; - const float *a5 = a0 + 5 * lda; - float *out_ptr = buffer + remain_m_start * k; - - uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m)); - uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_m)); - - int loops = k >> 2; - if (loops > 0) { -#if __aarch64__ - for (int l = 0; l < loops; ++l) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = - vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - _d3 = - vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); - - _d0 = vandq_f32(_d0, vmask2); - _d1 = vandq_f32(_d1, vmask2); - _d2 = vandq_f32(_d2, vmask2); - _d3 = vandq_f32(_d3, vmask2); - _d4 = vandq_f32(_q3.val[0], vmask3); - _d5 = vandq_f32(_q3.val[1], vmask3); - - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_d4)); - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_d5)); - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_d4)); - vst1q_f32(out_ptr + 18, _d3); - vst1_f32(out_ptr + 22, vget_high_f32(_d5)); - - a0 += 4; - a1 += 4; - a2 += 4; - a3 += 4; - 
a4 += 4; - a5 += 4; - out_ptr += 24; - } -#else - asm volatile( - "loop_4k_%=: \n" - "vld1.32 {d0-d1}, [%[a0]]! \n" - "vld1.32 {d2-d3}, [%[a1]]! \n" - "vld1.32 {d4-d5}, [%[a2]]! \n" - "vld1.32 {d6-d7}, [%[a3]]! \n" - "vld1.32 {d8-d9}, [%[a4]]! \n" - "vld1.32 {d10-d11}, [%[a5]]! \n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vtrn.32 q4, q5 \n" - "vswp.32 d1, d4 \n" - "vswp.32 d3, d6 \n" - - "vbif q0, %q[vzero], %q[vmask2] \n" - "vbif q1, %q[vzero], %q[vmask2] \n" - "vbif q2, %q[vzero], %q[vmask2] \n" - "vbif q3, %q[vzero], %q[vmask2] \n" - "vbif q4, %q[vzero], %q[vmask3] \n" - "vbif q5, %q[vzero], %q[vmask3] \n" - - "vst1.32 {q0}, [%[out]]! \n" - "vst1.32 {d8}, [%[out]]! \n" - "vst1.32 {q1}, [%[out]]! \n" - "vst1.32 {d10}, [%[out]]! \n" - "vst1.32 {q2}, [%[out]]! \n" - "vst1.32 {d9}, [%[out]]! \n" - "vst1.32 {q3}, [%[out]]! \n" - "vst1.32 {d11}, [%[out]]! \n" - - "subs %[loops], #1 \n" - "bne loop_4k_%= \n" - : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), - [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) - : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -#endif - } - - if (remain_k > 0) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - _d0 = vandq_f32(_d0, vmask1); - _d1 = vandq_f32(_d1, vmask1); - _d2 = vandq_f32(_d2, vmask1); - _d3 = vandq_f32(_d3, vmask1); - _d4 = vandq_f32(_d4, vmask1); - _d5 = vandq_f32(_d5, vmask1); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - // _d3 = 
vcombine_f32(vget_high_f32(_q0.val[1]), - // vget_high_f32(_q1.val[1])); - - _d0 = vandq_f32(_d0, vmask2); - _d1 = vandq_f32(_d1, vmask2); - _d2 = vandq_f32(_d2, vmask2); - // _d3 = vandq_f32(_d3, vmask2); - _d4 = vandq_f32(_q3.val[0], vmask3); - _d5 = vandq_f32(_q3.val[1], vmask3); - - switch (remain_k) { - case 3: - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_d4)); - case 2: - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_d5)); - case 1: - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_d4)); - default: - break; - } - } - } -} - -// 将B矩阵分块复制到连续内存(RowMajor) -void Gemm::PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer, const bool parallel) { - const int j_length = n - n_tail; - - #pragma omp parallel for if (parallel) - for (int i = 0; i < k; ++i) { - int j = 0; - for (; j < j_length - 31; j += 32) { - float *local_buffer0 = buffer + j * k + i * NR; - float *local_buffer1 = buffer + (j + 8) * k + i * NR; - float *local_buffer2 = buffer + (j + 16) * k + i * NR; - float *local_buffer3 = buffer + (j + 24) * k + i * NR; - const float *b0 = B + i * ldb + j; -#if __aarch64__ - asm volatile( - "prfm pldl1keep, [%[b0]] \n" - "ld1 {v0.4s, v1.4s}, [%[b0]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[b0]], #32 \n" - "ld1 {v4.4s, v5.4s}, [%[b0]], #32 \n" - "ld1 {v6.4s, v7.4s}, [%[b0]] \n" - "st1 {v0.4s, v1.4s}, [%[local_buffer0]], #32 \n" - "st1 {v2.4s, v3.4s}, [%[local_buffer1]], #32 \n" - "st1 {v4.4s, v5.4s}, [%[local_buffer2]], #32 \n" - "st1 {v6.4s, v7.4s}, [%[local_buffer3]], #32 \n" - : [local_buffer0] "+r"(local_buffer0), - [local_buffer1] "+r"(local_buffer1), - [local_buffer2] "+r"(local_buffer2), - [local_buffer3] "+r"(local_buffer3), [b0] "+r"(b0) - : - : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -#else - asm volatile( - // "pld [%[b]] \n" - "vld1.32 {q0, q1}, [%[b0]]! \n" - "vld1.32 {q2, q3}, [%[b0]]! \n" - "vld1.32 {q4, q5}, [%[b0]]! 
\n" - "vld1.32 {q6, q7}, [%[b0]]! \n" - "vst1.32 {q0, q1}, [%[local_buffer0]]! \n" - "vst1.32 {q2, q3}, [%[local_buffer1]]! \n" - "vst1.32 {q4, q5}, [%[local_buffer2]]! \n" - "vst1.32 {q6, q7}, [%[local_buffer3]]! \n" - : [local_buffer0] "+r"(local_buffer0), - [local_buffer1] "+r"(local_buffer1), - [local_buffer2] "+r"(local_buffer2), - [local_buffer3] "+r"(local_buffer3), [b0] "+r"(b0) - : - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif // __aarch64__ - } - for (; j < j_length - 15; j += 16) { - float *local_buffer0 = buffer + j * k + i * NR; - float *local_buffer1 = buffer + (j + 8) * k + i * NR; - const float *b0 = &B(i, j); -#if __ARM_NEON -#if __aarch64__ - asm volatile( - "prfm pldl1keep, [%[b0]] \n" - "ld1 {v0.4s, v1.4s}, [%[b0]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[b0]] \n" - "st1 {v0.4s, v1.4s}, [%[local_buffer0]], #32 \n" - "st1 {v2.4s, v3.4s}, [%[local_buffer1]], #32 \n" - : [local_buffer0] "+r"(local_buffer0), - [local_buffer1] "+r"(local_buffer1), [b0] "+r"(b0) - : - : "memory", "v0", "v1", "v2", "v3"); -#else - asm volatile( - // "pld [%[b0]] \n" - "vld1.32 {q0, q1}, [%[b0]]! \n" - "vld1.32 {q2, q3}, [%[b0]] \n" - "vst1.32 {q0, q1}, [%[local_buffer0]]! \n" - "vst1.32 {q2, q3}, [%[local_buffer1]]! 
\n" - : [local_buffer0] "+r"(local_buffer0), - [local_buffer1] "+r"(local_buffer1), [b0] "+r"(b0) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#endif // __ARM_NEON - } - for (; j < j_length; j += NR) { - float *local_buffer = buffer + j * k + i * NR; - const float *b0 = &B(i, j); -#if __aarch64__ - asm volatile( - "prfm pldl1keep, [%[b0]] \n" - "ld1 {v0.4s, v1.4s}, [%[b0]] \n" - "st1 {v0.4s, v1.4s}, [%[local_buffer]], #32 \n" - : [local_buffer] "+r"(local_buffer) - : [b0] "r"(b0) - : "memory", "v0", "v1"); -#else - asm volatile( - // "pld [%[b]] \n" - "vld1.32 {q0, q1}, [%[b0]] \n" - "vst1.32 {q0, q1}, [%[local_buffer]] \n" - : [local_buffer] "+r"(local_buffer) - : [b0] "r"(b0) - : "memory", "q0", "q1"); -#endif // __aarch64__ - } - } - if (n_tail != 0) { - uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(n_tail)); - uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(n_tail)); - - float *local_buffer = buffer + j_length * k; - for (int i = 0; i < k; ++i) { - const float *b0 = &B(i, j_length); -#if __aarch64__ - asm volatile( - "prfm pldl1keep, [%[b0]] \n" - "ld1 {v0.4s, v1.4s}, [%[b0]] \n" - "BIF v0.8b, %[vzero].8b, %[vmask1].8b \n" - "BIF v1.8b, %[vzero].8b, %[vmask2].8b \n" - "st1 {v0.4s, v1.4s}, [%[local_buffer]], #32 \n" - : [local_buffer] "+r"(local_buffer) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero), - [b0] "r"(b0) - : "memory", "v0", "v1"); -#else - asm volatile( - "vld1.32 {q0, q1}, [%[b0]] \n" - "vbif q0, %q[vzero], %q[vmask1] \n" - "vbif q1, %q[vzero], %q[vmask2] \n" - "vst1.32 {q0, q1}, [%[local_buffer]]! 
\n" - : [local_buffer] "+r"(local_buffer) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero), - [b0] "r"(b0) - : "memory", "q0", "q1"); -#endif - } - } -} - -#if __ARM_NEON -#if __aarch64__ -void Gemm::PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer, const bool parallel) { - const int j_length = n - n_tail; - - #pragma omp parallel for if (parallel) - for (int j = 0; j < j_length; j += NR) { - float *local_buffer = buffer + j * k; - for (int i = 0; i < k; ++i) { - const float *b0 = &B(i, j); - asm volatile( - "prfm pldl2keep, [%[b0], #64] \n\t" - "ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t" - "st1 {v0.4s, v1.4s, v2.4s}, [%[local_buffer]], #48 \n\t" - : [local_buffer] "+r"(local_buffer) - : [b0] "r"(b0) - : "memory", "v0", "v1", "v2"); - } - } - if (n_tail != 0) { - float *local_buffer = buffer + j_length * k; - for (int i = 0; i < k; ++i) { - const float *b0 = &B(i, j_length); - for (int j = j_length; j < n; ++j) { - *local_buffer++ = *b0++; - } - for (int j = n; j < j_length + NR; ++j) { - *local_buffer++ = 0; - } - } - } -} - -void Gemm::PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer, const bool parallel) { - const int j_length = n - n_tail; - - #pragma omp parallel for if (parallel) - for (int j = 0; j < n - n_tail; j += NR) { - float *local_buffer = buffer + j * k; - for (int i = 0; i < k; ++i) { - const float *b0 = &B(i, j); - asm volatile( - "prfm pldl2keep, [%[b0], #64] \n\t" - "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t" - "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[local_buffer]], #64 \n\t" - : [local_buffer] "+r"(local_buffer) - : [b0] "r"(b0) - : "memory", "v0", "v1", "v2", "v3"); - } - } - if (n_tail != 0) { - float *local_buffer = buffer + j_length * k; - for (int i = 0; i < k; ++i) { - const float *b0 = &B(i, j_length); - for (int j = j_length; j < n; ++j) { - *local_buffer++ = *b0++; - } - for (int j = n; j < j_length + NR; ++j) { - *local_buffer++ = 0; - } - } - } -} 
-#endif // __aarch64__ -#endif // __ARM_NEON - -// 分块矩阵乘法 -void Gemm::InnerKernel(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, int ldc, - bool relu) { -#pragma omp parallel for - for (int j = 0; j < nc; j += NR) { - for (int i = 0; i < mc; i += MR) { -#if __aarch64__ - // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#else - // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif - } - } - - if (alpha != 1) { - WriteWithAlphaBeta(mc, nc, c, C, ldc); - return; - } - if (beta == 0) { - WriteBasic(mc, nc, c, C, ldc); - return; - } - if (beta == 1 && !relu) { - WriteWithAdd(mc, nc, c, C, ldc); - return; - } - if (beta == 1 && relu) { - WriteWithAddRelu(mc, nc, c, C, ldc); - return; - } -} - -// 分块矩阵乘法 -void Gemm::InnerKernelWithBias(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *bias) { -#pragma omp parallel for - for (int j = 0; j < nc; j += NR) { - for (int i = 0; i < mc; i += MR) { -#if __aarch64__ - // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#else - // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif - } - } - - if (alpha != 1) { - WriteWithAlphaBeta(mc, nc, c, C, ldc); - return; - } - if (beta == 0) { - WriteBasic(mc, nc, c, C, ldc); - return; - } - if (beta == 1 && !relu) { - if (bias == nullptr) { - WriteWithAdd(mc, nc, c, C, ldc); - } else { - WriteWithAddV1(mc, nc, c, C, ldc, bias); - } - return; - } - if (beta == 1 && relu) { - if (bias == nullptr) { - WriteWithAddRelu(mc, nc, c, C, ldc); - } else { - 
WriteWithAddReluV1(mc, nc, c, C, ldc, bias); - } - return; - } -} - -// 分块矩阵乘法 -void Gemm::InnerKernelWithBn(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *new_scale, - float *new_bias) { -#pragma omp parallel for - for (int j = 0; j < nc; j += NR) { - for (int i = 0; i < mc; i += MR) { -#if __aarch64__ - // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#else - // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif - } - } - - if (relu) { - WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias); - } else { - WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias); - } -} - -// 分块矩阵乘法 -void Gemm::InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *new_scale, - float *new_bias, float *bias) { -#pragma omp parallel for - for (int j = 0; j < nc; j += NR) { - for (int i = 0; i < mc; i += MR) { -#if __aarch64__ - // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#else - // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif - } - } - WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias); -} - -void Gemm::InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, - float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) { -#pragma omp parallel for - for (int j = 0; j < nc; j += NR) { - for (int i = 0; i < mc; i += MR) { -#if __aarch64__ - // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x16(KC, a + i * KC, b + 
j * KC, c + i * NC + j, NC); -#else - // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif - } - } - WriteWithAddPRelu(mc, nc, c, C, ldc, p, mode, bias, bias1); -} - -#if __ARM_NEON -#if __aarch64__ - -void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { - // init C - float32x4_t cv0 = vdupq_n_f32(0.0); - float32x4_t cv1 = vdupq_n_f32(0.0); - float32x4_t cv2 = vdupq_n_f32(0.0); - float32x4_t cv3 = vdupq_n_f32(0.0); - float32x4_t cv4 = vdupq_n_f32(0.0); - float32x4_t cv5 = vdupq_n_f32(0.0); - float32x4_t cv6 = vdupq_n_f32(0.0); - float32x4_t cv7 = vdupq_n_f32(0.0); - float32x4_t cv8 = vdupq_n_f32(0.0); - float32x4_t cv9 = vdupq_n_f32(0.0); - float32x4_t cv10 = vdupq_n_f32(0.0); - float32x4_t cv11 = vdupq_n_f32(0.0); - - float32x4_t av; - float32x4_t bv0; - float32x4_t bv1; - - float32x2_t av01; - float32x2_t av23; - float32x2_t av45; - - for (int p = 0; p < k; p += 1) { - av = vld1q_f32(a); - av01 = vget_low_f32(av); - av23 = vget_high_f32(av); - av45 = vld1_f32(a + 4); - bv0 = vld1q_f32(b); - bv1 = vld1q_f32(b + 4); - - cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0); - cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); - cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); - cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); - - cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); - cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); - cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); - cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); - - cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0); - cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0); - cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1); - cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1); - - a += MR; - b += NR; - } - - vst1q_f32(c, cv0); - vst1q_f32(c + 4, cv1); - vst1q_f32(c + ldc, cv2); - vst1q_f32(c + ldc + 4, cv3); - vst1q_f32(c + 2 * ldc, cv4); - vst1q_f32(c + 2 * ldc + 4, cv5); - vst1q_f32(c + 3 * ldc, cv6); - vst1q_f32(c + 3 * ldc + 4, cv7); - 
vst1q_f32(c + 4 * ldc, cv8); - vst1q_f32(c + 4 * ldc + 4, cv9); - vst1q_f32(c + 5 * ldc, cv10); - vst1q_f32(c + 5 * ldc + 4, cv11); -} - -void Gemm::AddDot8x12(int k, const float *a, const float *b, float *c, - int ldc) { - const float *a_ptr, *b_ptr; - a_ptr = a; - b_ptr = b; - int kc1 = k; - int step = 4 * ldc; - asm volatile( - "dup v5.4s, wzr \n\t" - "dup v6.4s, wzr \n\t" - "dup v7.4s, wzr \n\t" - "dup v8.4s, wzr \n\t" - "dup v9.4s, wzr \n\t" - "dup v10.4s, wzr \n\t" - "dup v11.4s, wzr \n\t" - "dup v12.4s, wzr \n\t" - "dup v13.4s, wzr \n\t" - "dup v14.4s, wzr \n\t" - "dup v15.4s, wzr \n\t" - "dup v16.4s, wzr \n\t" - - "dup v17.4s, wzr \n\t" - "dup v18.4s, wzr \n\t" - "dup v19.4s, wzr \n\t" - "dup v20.4s, wzr \n\t" - "dup v21.4s, wzr \n\t" - "dup v22.4s, wzr \n\t" - "dup v23.4s, wzr \n\t" - "dup v24.4s, wzr \n\t" - "dup v25.4s, wzr \n\t" - "dup v26.4s, wzr \n\t" - "dup v27.4s, wzr \n\t" - "dup v28.4s, wzr \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt 2f \n\t" - "1: \n\t" - - "prfm pldl1keep, [%[a_ptr], #32] \n\t" - "prfm pldl1keep, [%[b_ptr], #48] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[a_ptr]], #32 \n\t" - "ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48 \n\t" - - "fmla v5.4s, v2.4s, v0.s[0] \n\t" - "fmla v6.4s, v3.4s, v0.s[0] \n\t" - "fmla v7.4s, v4.4s, v0.s[0] \n\t" - "fmla v8.4s, v2.4s, v0.s[1] \n\t" - "fmla v9.4s, v3.4s, v0.s[1] \n\t" - "fmla v10.4s, v4.4s, v0.s[1] \n\t" - "fmla v11.4s, v2.4s, v0.s[2] \n\t" - "fmla v12.4s, v3.4s, v0.s[2] \n\t" - "fmla v13.4s, v4.4s, v0.s[2] \n\t" - "fmla v14.4s, v2.4s, v0.s[3] \n\t" - "fmla v15.4s, v3.4s, v0.s[3] \n\t" - "fmla v16.4s, v4.4s, v0.s[3] \n\t" - - "fmla v17.4s, v2.4s, v1.s[0] \n\t" - "fmla v18.4s, v3.4s, v1.s[0] \n\t" - "fmla v19.4s, v4.4s, v1.s[0] \n\t" - "fmla v20.4s, v2.4s, v1.s[1] \n\t" - "fmla v21.4s, v3.4s, v1.s[1] \n\t" - "fmla v22.4s, v4.4s, v1.s[1] \n\t" - "fmla v23.4s, v2.4s, v1.s[2] \n\t" - "fmla v24.4s, v3.4s, v1.s[2] \n\t" - "fmla v25.4s, v4.4s, v1.s[2] \n\t" - "fmla v26.4s, v2.4s, v1.s[3] \n\t" - "fmla 
v27.4s, v3.4s, v1.s[3] \n\t" - "fmla v28.4s, v4.4s, v1.s[3] \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge 1b \n\t" - "2: \n\t" - - "st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t" - "st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t" - "st1 {v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" - "st1 {v14.4s, v15.4s, v16.4s}, [%[c]], %[step] \n\t" - "st1 {v17.4s, v18.4s, v19.4s}, [%[c]], %[step] \n\t" - "st1 {v20.4s, v21.4s, v22.4s}, [%[c]], %[step] \n\t" - "st1 {v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" - "st1 {v26.4s, v27.4s, v28.4s}, [%[c]], %[step] \n\t" - : - : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [step] "r"(step) - : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); -} - -void Gemm::AddDot6x16(int k, const float *a, const float *b, float *c, - int ldc) { - const float *a_ptr, *b_ptr; - a_ptr = a; - b_ptr = b; - int kc1 = k; - int step = 4 * ldc; - int step1 = 4 * 6; - asm volatile( - - "dup v6.4s, wzr \n\t" - "dup v7.4s, wzr \n\t" - "dup v8.4s, wzr \n\t" - "dup v9.4s, wzr \n\t" - "dup v10.4s, wzr \n\t" - "dup v11.4s, wzr \n\t" - "dup v12.4s, wzr \n\t" - "dup v13.4s, wzr \n\t" - - "dup v14.4s, wzr \n\t" - "dup v15.4s, wzr \n\t" - "dup v16.4s, wzr \n\t" - "dup v17.4s, wzr \n\t" - "dup v18.4s, wzr \n\t" - "dup v19.4s, wzr \n\t" - "dup v20.4s, wzr \n\t" - "dup v21.4s, wzr \n\t" - - "dup v22.4s, wzr \n\t" - "dup v23.4s, wzr \n\t" - "dup v24.4s, wzr \n\t" - "dup v25.4s, wzr \n\t" - "dup v26.4s, wzr \n\t" - "dup v27.4s, wzr \n\t" - "dup v28.4s, wzr \n\t" - "dup v29.4s, wzr \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt 2f \n\t" - "1: \n\t" - - "prfm pldl1keep, [%[a_ptr], #24] \n\t" - "prfm pldl1keep, [%[b_ptr], #64] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[a_ptr]], %[step1] \n\t" - "ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64 \n\t" - - "fmla v6.4s, v2.4s, v0.s[0] \n\t" - "fmla 
v7.4s, v3.4s, v0.s[0] \n\t" - "fmla v8.4s, v4.4s, v0.s[0] \n\t" - "fmla v9.4s, v5.4s, v0.s[0] \n\t" - - "fmla v10.4s, v2.4s, v0.s[1] \n\t" - "fmla v11.4s, v3.4s, v0.s[1] \n\t" - "fmla v12.4s, v4.4s, v0.s[1] \n\t" - "fmla v13.4s, v5.4s, v0.s[1] \n\t" - - "fmla v14.4s, v2.4s, v0.s[2] \n\t" - "fmla v15.4s, v3.4s, v0.s[2] \n\t" - "fmla v16.4s, v4.4s, v0.s[2] \n\t" - "fmla v17.4s, v5.4s, v0.s[2] \n\t" - - "fmla v18.4s, v2.4s, v0.s[3] \n\t" - "fmla v19.4s, v3.4s, v0.s[3] \n\t" - "fmla v20.4s, v4.4s, v0.s[3] \n\t" - "fmla v21.4s, v5.4s, v0.s[3] \n\t" - - "fmla v22.4s, v2.4s, v1.s[0] \n\t" - "fmla v23.4s, v3.4s, v1.s[0] \n\t" - "fmla v24.4s, v4.4s, v1.s[0] \n\t" - "fmla v25.4s, v5.4s, v1.s[0] \n\t" - - "fmla v26.4s, v2.4s, v1.s[1] \n\t" - "fmla v27.4s, v3.4s, v1.s[1] \n\t" - "fmla v28.4s, v4.4s, v1.s[1] \n\t" - "fmla v29.4s, v5.4s, v1.s[1] \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge 1b \n\t" - "2: \n\t" - - "st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t" - "st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" - "st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step] \n\t" - "st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step] \n\t" - "st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" - "st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step] \n\t" - : - : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [step] "r"(step), [step1] "r"(step1) - : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"); -} - -#else - -void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { - const float *a_ptr, *b_ptr; - a_ptr = a; - b_ptr = b; - int kc1 = k / 4; - int kc2 = k % 4; - int step = 4 * ldc; - asm volatile( - "pld [%[a_ptr]] \n\t" - "pld [%[b_ptr]] \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, 
#0.0 \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt end_kc1_%= \n\t" - "loop_kc1_%=: \n\t" - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr], #64] \n\t" - "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q4, q5}, [%[a_ptr]]! \n\t" - "vld1.32 {q6, q7}, [%[b_ptr]]! \n\t" - "vmla.f32 q10, q6, d8[0] \n\t" - "vmla.f32 q11, q6, d8[1] \n\t" - "vmla.f32 q12, q6, d9[0] \n\t" - "vmla.f32 q13, q6, d9[1] \n\t" - "vmla.f32 q10, q7, d10[0] \n\t" - "vmla.f32 q11, q7, d10[1] \n\t" - "vmla.f32 q12, q7, d11[0] \n\t" - "vmla.f32 q13, q7, d11[1] \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "bge loop_kc1_%= \n\t" - "end_kc1_%=: \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "blt end_kc2_%= \n\t" - "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a_ptr]]! \n\t" - "vld1.32 {q1}, [%[b_ptr]]! 
\n\t" - "vmla.f32 q10, q1, d0[0] \n\t" - "vmla.f32 q11, q1, d0[1] \n\t" - "vmla.f32 q12, q1, d1[0] \n\t" - "vmla.f32 q13, q1, d1[1] \n\t" - "subs %[kc2], %[kc2], #1 \n\t" - "bge loop_kc2_%= \n\t" - "end_kc2_%=: \n\t" - - "mov r5, %[c] \n\t" - "mov r6, %[step] \n\t" - "vst1.32 {q10}, [r5], r6 \n\t" - "vst1.32 {q11}, [r5], r6 \n\t" - "vst1.32 {q12}, [r5], r6 \n\t" - "vst1.32 {q13}, [r5] \n\t" - : - : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [kc2] "r"(kc2), [step] "r"(step) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q10", "q11", "q12", "q13"); -} - -void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { - const float *a_ptr, *b_ptr; - a_ptr = a; - b_ptr = b; - int kc1 = k / 4; - int kc2 = k % 4; - int step = 4 * ldc; - asm volatile( - "pld [%[a_ptr]] \n\t" - "pld [%[b_ptr]] \n\t" - - "vmov.f32 q8, #0.0 \n\t" - "vmov.f32 q9, #0.0 \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - "vmov.f32 q14, #0.0 \n\t" - "vmov.f32 q15, #0.0 \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt end_kc1_%= \n\t" - "loop_kc1_%=: \n\t" - - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr], #64] \n\t" - - "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t" - - "vmla.f32 q8, q2, d0[0] \n\t" - "vmla.f32 q9, q3, d0[0] \n\t" - "vmla.f32 q10, q2, d0[1] \n\t" - "vmla.f32 q11, q3, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q3, d1[0] \n\t" - "vmla.f32 q14, q2, d1[1] \n\t" - "vmla.f32 q15, q3, d1[1] \n\t" - - "vmla.f32 q8, q4, d2[0] \n\t" - "vmla.f32 q9, q5, d2[0] \n\t" - "vmla.f32 q10, q4, d2[1] \n\t" - "vmla.f32 q11, q5, d2[1] \n\t" - "vmla.f32 q12, q4, d3[0] \n\t" - "vmla.f32 q13, q5, d3[0] \n\t" - "vmla.f32 q14, q4, d3[1] \n\t" - "vmla.f32 q15, q5, d3[1] \n\t" - - "pld [%[b_ptr], #64] \n\t" - - "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" - "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t" - - "vmla.f32 q8, q2, d0[0] \n\t" - "vmla.f32 q9, q3, d0[0] \n\t" - "vmla.f32 q10, q2, d0[1] \n\t" - "vmla.f32 q11, q3, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q3, d1[0] \n\t" - "vmla.f32 q14, q2, d1[1] \n\t" - "vmla.f32 q15, q3, d1[1] \n\t" - - "vmla.f32 q8, q4, d2[0] \n\t" - "vmla.f32 q9, q5, d2[0] \n\t" - "vmla.f32 q10, q4, d2[1] \n\t" - "vmla.f32 q11, q5, d2[1] \n\t" - "vmla.f32 q12, q4, d3[0] \n\t" - "vmla.f32 q13, q5, d3[0] \n\t" - "vmla.f32 q14, q4, d3[1] \n\t" - "vmla.f32 q15, q5, d3[1] \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge loop_kc1_%= \n\t" - "end_kc1_%=: \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "blt end_kc2_%= \n\t" - "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - "vmla.f32 q8, q2, d0[0] \n\t" - "vmla.f32 q9, q3, d0[0] \n\t" - "vmla.f32 q10, q2, d0[1] \n\t" - "vmla.f32 q11, q3, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q3, d1[0] \n\t" - "vmla.f32 q14, q2, d1[1] \n\t" - "vmla.f32 q15, q3, d1[1] \n\t" - "subs %[kc2], %[kc2], #1 \n\t" - "bge loop_kc2_%= \n\t" - "end_kc2_%=: \n\t" - - "mov r5, %[c] \n\t" - "mov r6, %[step] \n\t" - "vst1.32 {q8, q9}, [r5], r6 \n\t" - "vst1.32 {q10, q11}, [r5], r6 \n\t" - "vst1.32 {q12, q13}, [r5], r6 \n\t" - "vst1.32 {q14, q15}, [r5] \n\t" - : - : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [kc2] "r"(kc2), [step] "r"(step) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); -} - -void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { - const float *a_ptr, *b_ptr; - a_ptr = a; - b_ptr = b; - int kc1 = k / 8; - int kc2 = k % 8; - int step = sizeof(float) * ldc; - asm volatile( - "pld [%[a_ptr]] \n\t" - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr]] \n\t" - "pld [%[b_ptr], #64] \n\t" - - "vmov.f32 q4, #0.0 \n\t" - "vmov.f32 q5, #0.0 \n\t" - "vmov.f32 q6, #0.0 \n\t" - 
"vmov.f32 q7, #0.0 \n\t" - "vmov.f32 q8, #0.0 \n\t" - "vmov.f32 q9, #0.0 \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - "vmov.f32 q14, #0.0 \n\t" - "vmov.f32 q15, #0.0 \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt 2f \n\t" - "1: \n\t" - - "pld [%[a_ptr], #128] \n\t" - "pld [%[b_ptr], #128] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "pld [%[a_ptr], #128] \n\t" - "pld [%[b_ptr], #128] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "pld [%[a_ptr], #128] \n\t" - "pld [%[b_ptr], #128] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "pld [%[a_ptr], #128] \n\t" - "pld [%[b_ptr], #128] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! 
\n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge 1b \n\t" - "2: \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "blt 4f \n\t" - "3: \n\t" - - "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" - "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "bge 3b \n\t" - "4: \n\t" - - "mov r5, %[c] \n\t" - "mov r6, %[step] \n\t" - "vst1.32 {q4, q5}, [r5], r6 \n\t" - "vst1.32 {q6, q7}, [r5], r6 \n\t" - "vst1.32 {q8, q9}, [r5], r6 \n\t" - "vst1.32 {q10, q11}, [r5], r6 \n\t" - "vst1.32 {q12, q13}, [r5], r6 \n\t" - "vst1.32 {q14, q15}, [r5] \n\t" - - : - : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [kc2] "r"(kc2), [step] "r"(step) - : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -#endif // __aarch64__ -#endif // __ARM_NEON - -#if __ARM_NEON -#if __aarch64__ - -// 分块矩阵乘法结果回写 -// C = A * B -void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - for (int j = 0; j < nc1; ++j) { - 
cv = vld1q_f32(c_ptr); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - } - } - } -} - -// C = alpha * A * B + beta * C -void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} - -// C = A * B + C -void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t cv1; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv1 = vld1q_f32(C_ptr); - cv = vaddq_f32(cv, cv1); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv1 = vld1q_f32(C_ptr); - cv = vaddq_f32(cv, cv1); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - } - } - } -} -// C = A * B + bias -void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, - float *bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t biasv; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - biasv = vld1q_dup_f32(bias + i); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - C_ptr++; - } - } - } -} - -// C = A * B + C, relu(C) -void Gemm::WriteWithAddRelu(int mc, int 
nc, float *c, float *C, int ldc) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t cv1; - float32x4_t zero = vdupq_n_f32(0.0); - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv1 = vld1q_f32(C_ptr); - cv = vaddq_f32(cv, cv1); - cv = vmaxq_f32(cv, zero); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv1 = vld1q_f32(C_ptr); - cv = vaddq_f32(cv, cv1); - cv = vmaxq_f32(cv, zero); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - } - } - } -} - -// C = A * B + bias, relu(C) -void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, - float *bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t biasv; - float32x4_t zero = vdupq_n_f32(0.0); - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - biasv = vld1q_dup_f32(bias + i); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - C_ptr++; - } - } - } -} - -// C = A * B + C,prelu(C) -void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, - float *p, std::string mode, float *bias, - float *bias1) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t cv1; - float32x4_t biasv; - float32x4_t biasv1; - float32x4_t zero = vdupq_n_f32(0.0); - 
float32x4_t pv; - float *ptr = p; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - biasv = vld1q_dup_f32(bias + i); - if (bias1 == nullptr) { - biasv1 = zero; - } else { - biasv1 = vld1q_dup_f32(bias1 + i); - } - - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - cv = vaddq_f32(cv, biasv1); - cv = vmaxq_f32(cv, zero); - cv1 = vminq_f32(cv, zero); - if (mode == "channel") { - cv1 = vmulq_n_f32(cv1, ptr[i]); - } else if (mode == "element") { - pv = vld1q_f32(ptr); - cv1 = vmulq_f32(cv1, pv); - ptr = ptr + 4; - } else { - cv1 = vmulq_n_f32(cv1, ptr[0]); - } - cv = vaddq_f32(cv, cv1); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - cv = vaddq_f32(cv, biasv1); - cv = vmaxq_f32(cv, zero); - cv1 = vminq_f32(cv, zero); - if (mode == "channel") { - cv1 = vmulq_n_f32(cv1, ptr[i]); - } else if (mode == "element") { - pv = vld1q_f32(ptr); - cv1 = vmulq_f32(cv1, pv); - ptr = ptr + 4; - } else { - cv1 = vmulq_n_f32(cv1, ptr[0]); - } - cv = vaddq_f32(cv, cv1); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - C_ptr++; - } - } - } -} - -// C = A * B, batchnorm(C) -void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t cv1; - float32x4_t bias; - float32x2_t scale; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - bias = vld1q_dup_f32(new_bias); - scale = vld1_dup_f32(new_scale); - new_bias++; - new_scale++; - float scale0 = vget_lane_f32(scale, 0); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vmlaq_n_f32(bias, cv, scale0); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv 
= vld1q_f32(c_ptr); - cv = vmlaq_n_f32(bias, cv, scale0); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - C_ptr++; - } - } - } -} - -// C = A * B, batchnorm(C), relu(C) -void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t bias; - float32x2_t scale; - float32x4_t zero = vdupq_n_f32(0.0); - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - bias = vld1q_dup_f32(new_bias); - scale = vld1_dup_f32(new_scale); - new_bias++; - new_scale++; - float scale0 = vget_lane_f32(scale, 0); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vmlaq_n_f32(bias, cv, scale0); - cv = vmaxq_f32(cv, zero); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv = vmlaq_n_f32(bias, cv, scale0); - cv = vmaxq_f32(cv, zero); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - } - } - } -} - -// C = A * B, batchnorm(C),C = C + bias; relu(C) -void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias, float *bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr, *bias_ptr; - float32x4_t cv; - float32x4_t nbias; - float32x2_t scale; - float32x4_t biasv; - float32x4_t zero = vdupq_n_f32(0.0); - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - bias_ptr = bias + i * ldc; - nbias = vld1q_dup_f32(new_bias); - scale = vld1_dup_f32(new_scale); - new_bias++; - new_scale++; - float scale0 = vget_lane_f32(scale, 0); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - biasv = 
vld1q_f32(bias_ptr); - cv = vmlaq_n_f32(nbias, cv, scale0); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - bias_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - biasv = vld1q_f32(bias_ptr); - cv = vmlaq_n_f32(nbias, cv, scale0); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - } - } - } -} - -#else - -void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A, - int lda, const float *B, int ldb, float beta, float *C, - int ldc, bool relu) { - float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); - - const float *a0, *b0, *b1, *b2, *b3; - float *c0, *C0; - - int volatile kc1 = k / 4; - int volatile kc2 = k % 4; - int volatile nc1 = n / 16; - int _nc1 = n % 16; - int volatile nc2 = _nc1 / 4; - int volatile nc3 = _nc1 % 4; - for (int i = 0; i < kc1; i++) { - a0 = A + i * 4; - b0 = B + i * 4 * ldb; - b1 = b0 + ldb; - b2 = b1 + ldb; - b3 = b2 + ldb; - c0 = bufferC; - asm volatile( - "pld [%[a0], #16] \n\t" - "vld1.32 {q0}, [%[a0]] \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "cmp %[i], #0 \n\t" - "beq i_eq0_%= \n\t" - "bne i_ne0_%= \n\t" - - "i_eq0_%=: \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - "b gemm_nc1_%= \n\t" - - "i_ne0_%=: \n\t" - "pld [%[c0], #64] \n\t" - "vld1.32 {q10, q11}, [%[c0]]! \n\t" - "vld1.32 {q12, q13}, [%[c0]] \n\t" - "sub %[c0], %[c0], #32 \n\t" - - "gemm_nc1_%=: \n\t" - "pld [%[b0], #64] \n\t" - "vld1.32 {q2, q3}, [%[b0]]! \n\t" - "vld1.32 {q4, q5}, [%[b0]]! 
\n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q3, d0[0] \n\t" - "vmla.f32 q12, q4, d0[0] \n\t" - "vmla.f32 q13, q5, d0[0] \n\t" - - "pld [%[b1], #64] \n\t" - "vld1.32 {q2, q3}, [%[b1]]! \n\t" - "vld1.32 {q4, q5}, [%[b1]]! \n\t" - "vmla.f32 q10, q2, d0[1] \n\t" - "vmla.f32 q11, q3, d0[1] \n\t" - "vmla.f32 q12, q4, d0[1] \n\t" - "vmla.f32 q13, q5, d0[1] \n\t" - - "pld [%[b2], #64] \n\t" - "vld1.32 {q2, q3}, [%[b2]]! \n\t" - "vld1.32 {q4, q5}, [%[b2]]! \n\t" - "vmla.f32 q10, q2, d1[0] \n\t" - "vmla.f32 q11, q3, d1[0] \n\t" - "vmla.f32 q12, q4, d1[0] \n\t" - "vmla.f32 q13, q5, d1[0] \n\t" - - "pld [%[b3], #64] \n\t" - "vld1.32 {q2, q3}, [%[b3]]! \n\t" - "vld1.32 {q4, q5}, [%[b3]]! \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q4, d1[1] \n\t" - "vmla.f32 q13, q5, d1[1] \n\t" - - "vst1.32 {q10, q11}, [%[c0]]! \n\t" - "vst1.32 {q12, q13}, [%[c0]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "cmp %[i], #0 \n\t" - "beq ii_eq0_%= \n\t" - "bne ii_ne0_%= \n\t" - - "ii_eq0_%=: \n\t" - "vmov.f32 q10, #0.0 \n\t" - "b gemm_nc2_%= \n\t" - - "ii_ne0_%=: \n\t" - "pld [%[c0], #16] \n\t" - "vld1.32 {q10}, [%[c0]] \n\t" - - "gemm_nc2_%=: \n\t" - "pld [%[b0], #16] \n\t" - "vld1.32 {q2}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - - "pld [%[b1], #16] \n\t" - "vld1.32 {q3}, [%[b1]]! \n\t" - "vmla.f32 q10, q3, d0[1] \n\t" - - "pld [%[b2], #16] \n\t" - "vld1.32 {q4}, [%[b2]]! \n\t" - "vmla.f32 q10, q4, d1[0] \n\t" - - "pld [%[b3], #16] \n\t" - "vld1.32 {q5}, [%[b3]]! \n\t" - "vmla.f32 q10, q5, d1[1] \n\t" - - "vst1.32 {q10}, [%[c0]]! 
\n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), - [c0] "+r"(c0) - : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2) - : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); - - for (int j = 0; j < nc3; j++) { - if (i == 0) { - *c0 = (*a0) * (*b0++); - } else { - *c0 += (*a0) * (*b0++); - } - *c0 += (*(a0 + 1)) * (*b1++); - *c0 += (*(a0 + 2)) * (*b2++); - *c0 += (*(a0 + 3)) * (*b3++); - c0++; - } - } - - for (int i = 0; i < kc2; ++i) { - a0 = A + 4 * kc1 + i; - b0 = B + (4 * kc1 + i) * ldb; - c0 = bufferC; - asm volatile( - "pld [%[a0], #16] \n\t" - "vld1.32 {d0}, [%[a0]] \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "pld [%[c0], #64] \n\t" - "vld1.32 {q10, q11}, [%[c0]]! \n\t" - "vld1.32 {q12, q13}, [%[c0]] \n\t" - "sub %[c0], %[c0], #32 \n\t" - - "gemm_nc1_%=: \n\t" - "pld [%[b0], #64] \n\t" - "vld1.32 {q2, q3}, [%[b0]]! \n\t" - "vld1.32 {q4, q5}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q3, d0[0] \n\t" - "vmla.f32 q12, q4, d0[0] \n\t" - "vmla.f32 q13, q5, d0[0] \n\t" - - "vst1.32 {q10, q11}, [%[c0]]! \n\t" - "vst1.32 {q12, q13}, [%[c0]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "pld [%[c0], #16] \n\t" - "vld1.32 {q10}, [%[c0]] \n\t" - - "gemm_nc2_%=: \n\t" - "vld1.32 {q2}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - - "vst1.32 {q10}, [%[c0]]! 
\n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), - [c0] "+r"(c0) - : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2) - : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); - - for (int j = 0; j < nc3; j++) { - *c0 += (*a0) * (*b0++); - c0++; - } - } - - if (alpha != 1) { - VecWriteWithAlphaBeta(n, bufferC, C, ldc); - return; - } - if (beta == 0) { - VecWriteBasic(n, bufferC, C, ldc); - return; - } - if (beta == 1 && !relu) { - VecWriteWithAdd(n, bufferC, C, ldc); - return; - } - if (beta == 1 && relu) { - VecWriteWithAddRelu(n, bufferC, C, ldc); - return; - } -} - -void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, - int lda, const float *B, int ldb, float beta, - float *C, int ldc, bool relu, float *new_scale, - float *new_bias) { - float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); - - const float *a0, *b0, *b1, *b2, *b3; - float *c0, *C0; - - int volatile kc1 = k / 4; - int volatile kc2 = k % 4; - int volatile nc1 = n / 16; - int _nc1 = n % 16; - int volatile nc2 = _nc1 / 4; - int volatile nc3 = _nc1 % 4; - for (int i = 0; i < kc1; i++) { - a0 = A + i * 4; - b0 = B + i * 4 * ldb; - b1 = b0 + ldb; - b2 = b1 + ldb; - b3 = b2 + ldb; - c0 = bufferC; - asm volatile( - "pld [%[a0], #16] \n\t" - "vld1.32 {q0}, [%[a0]] \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "cmp %[i], #0 \n\t" - "beq i_eq0_%= \n\t" - "bne i_ne0_%= \n\t" - - "i_eq0_%=: \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - "b gemm_nc1_%= \n\t" - - "i_ne0_%=: \n\t" - "pld [%[c0], #64] \n\t" - "vld1.32 {q10, q11}, [%[c0]]! \n\t" - "vld1.32 {q12, q13}, [%[c0]] \n\t" - "sub %[c0], %[c0], #32 \n\t" - - "gemm_nc1_%=: \n\t" - "pld [%[b0], #64] \n\t" - "vld1.32 {q2, q3}, [%[b0]]! \n\t" - "vld1.32 {q4, q5}, [%[b0]]! 
\n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q3, d0[0] \n\t" - "vmla.f32 q12, q4, d0[0] \n\t" - "vmla.f32 q13, q5, d0[0] \n\t" - - "pld [%[b1], #64] \n\t" - "vld1.32 {q2, q3}, [%[b1]]! \n\t" - "vld1.32 {q4, q5}, [%[b1]]! \n\t" - "vmla.f32 q10, q2, d0[1] \n\t" - "vmla.f32 q11, q3, d0[1] \n\t" - "vmla.f32 q12, q4, d0[1] \n\t" - "vmla.f32 q13, q5, d0[1] \n\t" - - "pld [%[b2], #64] \n\t" - "vld1.32 {q2, q3}, [%[b2]]! \n\t" - "vld1.32 {q4, q5}, [%[b2]]! \n\t" - "vmla.f32 q10, q2, d1[0] \n\t" - "vmla.f32 q11, q3, d1[0] \n\t" - "vmla.f32 q12, q4, d1[0] \n\t" - "vmla.f32 q13, q5, d1[0] \n\t" - - "pld [%[b3], #64] \n\t" - "vld1.32 {q2, q3}, [%[b3]]! \n\t" - "vld1.32 {q4, q5}, [%[b3]]! \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q4, d1[1] \n\t" - "vmla.f32 q13, q5, d1[1] \n\t" - - "vst1.32 {q10, q11}, [%[c0]]! \n\t" - "vst1.32 {q12, q13}, [%[c0]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "cmp %[i], #0 \n\t" - "beq ii_eq0_%= \n\t" - "bne ii_ne0_%= \n\t" - - "ii_eq0_%=: \n\t" - "vmov.f32 q10, #0.0 \n\t" - "b gemm_nc2_%= \n\t" - - "ii_ne0_%=: \n\t" - "pld [%[c0], #16] \n\t" - "vld1.32 {q10}, [%[c0]] \n\t" - - "gemm_nc2_%=: \n\t" - "pld [%[b0], #16] \n\t" - "vld1.32 {q2}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - - "pld [%[b1], #16] \n\t" - "vld1.32 {q3}, [%[b1]]! \n\t" - "vmla.f32 q10, q3, d0[1] \n\t" - - "pld [%[b2], #16] \n\t" - "vld1.32 {q4}, [%[b2]]! \n\t" - "vmla.f32 q10, q4, d1[0] \n\t" - - "pld [%[b3], #16] \n\t" - "vld1.32 {q5}, [%[b3]]! \n\t" - "vmla.f32 q10, q5, d1[1] \n\t" - - "vst1.32 {q10}, [%[c0]]! 
\n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), - [c0] "+r"(c0) - : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2) - : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); - - for (int j = 0; j < nc3; j++) { - if (i == 0) { - *c0 = (*a0) * (*b0++); - } else { - *c0 += (*a0) * (*b0++); - } - *c0 += (*(a0 + 1)) * (*b1++); - *c0 += (*(a0 + 2)) * (*b2++); - *c0 += (*(a0 + 3)) * (*b3++); - c0++; - } - } - - for (int i = 0; i < kc2; ++i) { - a0 = A + 4 * kc1 + i; - b0 = B + (4 * kc1 + i) * ldb; - c0 = bufferC; - asm volatile( - "pld [%[a0], #16] \n\t" - "vld1.32 {d0}, [%[a0]] \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "pld [%[c0], #64] \n\t" - "vld1.32 {q10, q11}, [%[c0]]! \n\t" - "vld1.32 {q12, q13}, [%[c0]] \n\t" - "sub %[c0], %[c0], #32 \n\t" - - "gemm_nc1_%=: \n\t" - "pld [%[b0], #64] \n\t" - "vld1.32 {q2, q3}, [%[b0]]! \n\t" - "vld1.32 {q4, q5}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q3, d0[0] \n\t" - "vmla.f32 q12, q4, d0[0] \n\t" - "vmla.f32 q13, q5, d0[0] \n\t" - - "vst1.32 {q10, q11}, [%[c0]]! \n\t" - "vst1.32 {q12, q13}, [%[c0]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "pld [%[c0], #16] \n\t" - "vld1.32 {q10}, [%[c0]] \n\t" - - "gemm_nc2_%=: \n\t" - "vld1.32 {q2}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - - "vst1.32 {q10}, [%[c0]]! 
\n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), - [c0] "+r"(c0) - : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2) - : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); - - for (int j = 0; j < nc3; j++) { - *c0 += (*a0) * (*b0++); - c0++; - } - } - - if (relu) { - VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias); - } else { - VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias); - } -} - -// C = A * B -void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { - int nc1 = nc / 16; - int _nc1 = nc % 16; - int step = 4 * ldc; - int step1 = 4 * (NC - 16 * nc1); - int volatile m = mc; - - float *volatile c_ptr, *volatile C_ptr; - float *C0, *c0; - c_ptr = c; - C_ptr = C; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vst1.32 {q0, q1}, [r6]! \n\t" - - "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" - "vst1.32 {q2, q3}, [r6]! 
\n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), - [step] "r"(step), [step1] "r"(step1) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); - } - - if (_nc1 != 0) { - for (int i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 16 + i * ldc; - c0 = c_ptr + nc1 * 16 + i * NC; - for (int j = 0; j < _nc1; j++) { - *C0++ = *c0++; - } - } - } -} - -// C = alpha * A * B + beta * C -void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} - -// C = A * B + C -void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { - int nc1 = nc / 16; - int _nc1 = nc % 16; - int step = 4 * ldc; - int step1 = 4 * (NC - 16 * nc1); - int volatile m = mc; - - float *volatile c_ptr, *volatile C_ptr; - float *C0, *c0; - c_ptr = c; - C_ptr = C; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [r6] \n\t" - "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vst1.32 {q10, q11}, [r6]! \n\t" - - "vld1.32 {q4, q5}, [r6] \n\t" - "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vst1.32 {q12, q13}, [r6]! 
\n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), - [step] "r"(step), [step1] "r"(step1) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q10", "q11", "q12", "q13"); - } - - if (_nc1 != 0) { - for (int i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 16 + i * ldc; - c0 = c_ptr + nc1 * 16 + i * NC; - for (int j = 0; j < _nc1; j++) { - *C0++ += *c0++; - } - } - } -} - -// C = A * B + bias -void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, - float *bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t biasv; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - biasv = vld1q_dup_f32(bias + i); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - C_ptr++; - } - } - } -} - -// C = A * B + C, relu(C) -void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { - int nc1 = nc / 16; - int _nc1 = nc % 16; - int step = 4 * ldc; - int step1 = 4 * (NC - 16 * nc1); - int volatile m = mc; - - float *volatile c_ptr, *volatile C_ptr; - float *C0, *c0; - c_ptr = c; - C_ptr = C; - if (nc1 > 0) { - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - 
"vld1.32 {q0, q1}, [r6] \n\t" - "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [r6]! \n\t" - - "vld1.32 {q4, q5}, [r6] \n\t" - "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [r6]! \n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), - [step] "r"(step), [step1] "r"(step1) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q10", "q11", "q12", "q13"); - } - - if (_nc1 != 0) { - for (int i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 16 + i * ldc; - c0 = c_ptr + nc1 * 16 + i * NC; - for (int j = 0; j < _nc1; j++) { - *C0 += *c0; - if (*C0 < 0) { - *C0 = 0; - } - C0++; - c0++; - } - } - } -} - -// C = A * B + bias, relu(C) -void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, - float *bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr; - float32x4_t cv; - float32x4_t biasv; - float32x4_t zero = vdupq_n_f32(0.0); - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - biasv = vld1q_dup_f32(bias + i); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - 
C_ptr++; - } - } - } -} - -void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, - float *p, std::string mode, float *bias, - float *bias1) { - if (nc < 4) { - if (bias1 == nullptr) { - for (int i = 0; i < mc; ++i) { - for (int j = 0; j < nc; ++j) { - float r = c[i * NC + j] + bias[i]; - if (r < 0) { - r *= p[i]; - } - C[i * ldc + j] = r; - } - } - } else { - for (int i = 0; i < mc; ++i) { - for (int j = 0; j < nc; ++j) { - float r = c[i * NC + j] + bias[i]; - r += bias1[i * ldc + j]; - if (r < 0) { - r *= p[i]; - } - C[i * ldc + j] = r; - } - } - } - return; - } - - int nc1 = nc / 16; - int _nc1 = nc % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); - int step = 4 * (ldc - nc); - int step1 = 4 * (NC - nc); - - if (bias1 == nullptr) { - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r5, %[nc1] \n\t" - "mov r6, %[nc2] \n\t" - "vld1.32 {d0}, [%[bias]] \n\t" - "vld1.32 {d1}, [%[p]] \n\t" - "vdup.32 q1, d0[0] \n\t" - "vdup.32 q2, d1[0] \n\t" - - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "pld [%[c], #32] \n\t" - "vld1.32 {q3, q4}, [%[c]]! \n\t" - "vld1.32 {q9, q10}, [%[c]]! \n\t" - - "vadd.f32 q3, q3, q1 \n\t" - "vadd.f32 q4, q4, q1 \n\t" - "vadd.f32 q9, q9, q1 \n\t" - "vadd.f32 q10, q10, q1 \n\t" - - "vmax.f32 q5, q3, q14 \n\t" - "vmin.f32 q7, q3, q14 \n\t" - "vmax.f32 q6, q4, q14 \n\t" - "vmin.f32 q8, q4, q14 \n\t" - - "vmax.f32 q11, q9, q14 \n\t" - "vmin.f32 q13, q9, q14 \n\t" - "vmax.f32 q12, q10, q14 \n\t" - "vmin.f32 q15, q10, q14 \n\t" - - "vmla.f32 q5, q7, q2 \n\t" - "vmla.f32 q6, q8, q2 \n\t" - "vmla.f32 q11, q13, q2 \n\t" - "vmla.f32 q12, q15, q2 \n\t" - - "vst1.32 {q5, q6}, [%[C]]! \n\t" - "vst1.32 {q11, q12}, [%[C]]! \n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs r6, r6, #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "vld1.32 {q3}, [%[c]]! 
\n\t" - "vadd.f32 q3, q3, q1 \n\t" - "vmax.f32 q5, q3, q14 \n\t" - "vmin.f32 q7, q3, q14 \n\t" - "vmla.f32 q5, q7, q2 \n\t" - "vst1.32 {q5}, [%[C]]! \n\t" - - "subs r6, r6, #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - - "sub %[c], %[c], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - - "vld1.32 {q4}, [%[c]]! \n\t" - "vadd.f32 q4, q4, q1 \n\t" - "vmax.f32 q6, q4, q14 \n\t" - "vmin.f32 q8, q4, q14 \n\t" - "vmla.f32 q6, q8, q2 \n\t" - "vst1.32 {q6}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" - - "add %[p], %[p], #4 \n\t" - "add %[bias], %[bias], #4 \n\t" - "add %[c], %[c], %[step1] \n\t" - "add %[C], %[C], %[step] \n\t" - - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), - [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), [p] "r"(p), - [bias] "r"(bias), [bias1] "r"(bias1) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8"); - } else { - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r5, %[nc1] \n\t" - "mov r6, %[nc2] \n\t" - "vld1.32 {d0}, [%[bias]] \n\t" - "vld1.32 {d1}, [%[p]] \n\t" - "vdup.32 q1, d0[0] \n\t" - "vdup.32 q2, d1[0] \n\t" - - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "pld [%[c], #32] \n\t" - "pld [%[bias1], #32] \n\t" - "vld1.32 {q3, q4}, [%[c]]! \n\t" - "vld1.32 {q9, q10}, [%[bias1]]! \n\t" - "vadd.f32 q3, q3, q1 \n\t" - "vadd.f32 q4, q4, q1 \n\t" - "vadd.f32 q3, q3, q9 \n\t" - "vadd.f32 q4, q4, q10 \n\t" - "vmax.f32 q5, q3, q14 \n\t" - "vmin.f32 q7, q3, q14 \n\t" - "vmax.f32 q6, q4, q14 \n\t" - "vmin.f32 q8, q4, q14 \n\t" - "vmla.f32 q5, q7, q2 \n\t" - "vmla.f32 q6, q8, q2 \n\t" - "vst1.32 {q5, q6}, [%[C]]! \n\t" - - "vld1.32 {q3, q4}, [%[c]]! \n\t" - "vld1.32 {q9, q10}, [%[bias1]]! 
\n\t" - "vadd.f32 q3, q3, q1 \n\t" - "vadd.f32 q4, q4, q1 \n\t" - "vadd.f32 q3, q3, q9 \n\t" - "vadd.f32 q4, q4, q10 \n\t" - "vmax.f32 q5, q3, q14 \n\t" - "vmin.f32 q7, q3, q14 \n\t" - "vmax.f32 q6, q4, q14 \n\t" - "vmin.f32 q8, q4, q14 \n\t" - "vmla.f32 q5, q7, q2 \n\t" - "vmla.f32 q6, q8, q2 \n\t" - "vst1.32 {q5, q6}, [%[C]]! \n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs r6, r6, #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "vld1.32 {q3}, [%[c]]! \n\t" - "vld1.32 {q9}, [%[bias1]]! \n\t" - "vadd.f32 q3, q3, q1 \n\t" - "vadd.f32 q3, q3, q9 \n\t" - "vmax.f32 q5, q3, q14 \n\t" - "vmin.f32 q7, q3, q14 \n\t" - "vmla.f32 q5, q7, q2 \n\t" - "vst1.32 {q5}, [%[C]]! \n\t" - - "subs r6, r6, #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - - "sub %[c], %[c], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - "sub %[bias1], %[bias1], %[nc3] \n\t" - - "vld1.32 {q4}, [%[c]]! \n\t" - "vld1.32 {q10}, [%[bias1]]! \n\t" - "vadd.f32 q4, q4, q1 \n\t" - "vadd.f32 q4, q4, q10 \n\t" - "vmax.f32 q6, q4, q14 \n\t" - "vmin.f32 q8, q4, q14 \n\t" - "vmla.f32 q6, q8, q2 \n\t" - "vst1.32 {q6}, [%[C]]! 
\n\t" - "end_nc3_%=: \n\t" - - "add %[p], %[p], #4 \n\t" - "add %[bias], %[bias], #4 \n\t" - "add %[c], %[c], %[step1] \n\t" - "add %[C], %[C], %[step] \n\t" - "add %[bias1], %[bias1], %[step] \n\t" - - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), - [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), [p] "r"(p), - [bias] "r"(bias), [bias1] "r"(bias1) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10"); - } -} - -// C = A * B, batchnorm(C) -void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, - float *scale, float *bias) { - if (nc < 4) { - for (int i = 0; i < mc; ++i) { - for (int j = 0; j < nc; ++j) { - *C = (*c) * (*scale) + (*bias); - C++; - c++; - } - C += (ldc - nc); - c += (NC - nc); - scale++; - bias++; - } - return; - } - - int volatile nc1 = nc / 16; - int _nc1 = nc % 16; - int volatile nc2 = _nc1 / 4; - int volatile nc3 = 16 - 4 * (_nc1 % 4); - int volatile step = 4 * (ldc - nc); - int volatile step1 = 4 * (NC - nc); - - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r5, %[nc1] \n\t" - "mov r6, %[nc2] \n\t" - "vld1.32 {d0}, [%[scale]] \n\t" - "vld1.32 {d1}, [%[bias]] \n\t" - "vdup.32 q1, d0[0] \n\t" - "vdup.32 q2, d1[0] \n\t" - - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q3, q4}, [%[c]]! \n\t" - "vmul.f32 q10, q3, q1 \n\t" - "vmul.f32 q11, q4, q1 \n\t" - "vadd.f32 q10, q10, q2 \n\t" - "vadd.f32 q11, q11, q2 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" - - "vld1.32 {q5, q6}, [%[c]]! \n\t" - "vmul.f32 q12, q5, q1 \n\t" - "vmul.f32 q13, q6, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q2 \n\t" - "vst1.32 {q12, q13}, [%[C]]! 
\n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs r6, r6, #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "vld1.32 {q7}, [%[c]]! \n\t" - "vmul.f32 q10, q7, q1 \n\t" - "vadd.f32 q10, q10, q2 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - - "subs r6, r6, #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - - "sub %[c], %[c], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - - "vld1.32 {q8}, [%[c]]! \n\t" - "vmul.f32 q11, q8, q1 \n\t" - "vadd.f32 q11, q11, q2 \n\t" - "vst1.32 {q11}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" - - "add %[scale], %[scale], #4 \n\t" - "add %[bias], %[bias], #4 \n\t" - "add %[c], %[c], %[step1] \n\t" - "add %[C], %[C], %[step] \n\t" - - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), - [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q10", "q11", "q12", "q13"); -} - -// C = A * B, batchnorm(C), relu(C) -void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, - float *scale, float *bias) { - if (nc < 4) { - for (int i = 0; i < mc; ++i) { - for (int j = 0; j < nc; ++j) { - *C = (*c) * (*scale) + (*bias); - if (*C < 0) { - *C = 0; - } - C++; - c++; - } - C += (ldc - nc); - c += (NC - nc); - scale++; - bias++; - } - return; - } - - int nc1 = nc / 16; - int _nc1 = nc % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); - int step = 4 * (ldc - nc); - int step1 = 4 * (NC - nc); - - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r5, %[nc1] \n\t" - "mov r6, %[nc2] \n\t" - "vld1.32 {d0}, [%[scale]] \n\t" - "vld1.32 {d1}, [%[bias]] \n\t" - "vdup.32 q1, d0[0] \n\t" - "vdup.32 q2, d1[0] \n\t" - - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= 
\n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q3, q4}, [%[c]]! \n\t" - "vmul.f32 q10, q3, q1 \n\t" - "vmul.f32 q11, q4, q1 \n\t" - "vadd.f32 q10, q10, q2 \n\t" - "vadd.f32 q11, q11, q2 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" - - "vld1.32 {q5, q6}, [%[c]]! \n\t" - "vmul.f32 q12, q5, q1 \n\t" - "vmul.f32 q13, q6, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q2 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs r6, r6, #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "vld1.32 {q7}, [%[c]]! \n\t" - "vmul.f32 q10, q7, q1 \n\t" - "vadd.f32 q10, q10, q2 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - - "subs r6, r6, #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - - "sub %[c], %[c], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - - "vld1.32 {q8}, [%[c]]! \n\t" - "vmul.f32 q11, q8, q1 \n\t" - "vadd.f32 q11, q11, q2 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q11}, [%[C]]! 
\n\t" - "end_nc3_%=: \n\t" - - "add %[scale], %[scale], #4 \n\t" - "add %[bias], %[bias], #4 \n\t" - "add %[c], %[c], %[step1] \n\t" - "add %[C], %[C], %[step] \n\t" - - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), - [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q10", "q11", "q12", "q13", "q14"); -} - -// C = A * B, batchnorm(C),C = C + bias; relu(C) -void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias, float *bias) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float *c_ptr, *C_ptr, *bias_ptr; - float32x4_t cv; - float32x4_t nbias; - float32x2_t scale; - float32x4_t biasv; - float32x4_t zero = vdupq_n_f32(0.0); - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - bias_ptr = bias + i * ldc; - nbias = vld1q_dup_f32(new_bias); - scale = vld1_dup_f32(new_scale); - new_bias++; - new_scale++; - float scale0 = vget_lane_f32(scale, 0); - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - biasv = vld1q_f32(bias_ptr); - cv = vmlaq_n_f32(nbias, cv, scale0); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - bias_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - biasv = vld1q_f32(bias_ptr); - cv = vmlaq_n_f32(nbias, cv, scale0); - cv = vaddq_f32(cv, biasv); - cv = vmaxq_f32(cv, zero); - if (_nc1 >= 1) { - vst1q_lane_f32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_f32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_f32(C_ptr, cv, 2); - } - } - } -} - -// C = A * B -void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); - - asm volatile( - "subs 
%[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vst1.32 {q0, q1}, [%[C]]! \n\t" - - "vld1.32 {q2, q3}, [%[c]]! \n\t" - "vst1.32 {q2, q3}, [%[C]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "vld1.32 {q4}, [%[c]]! \n\t" - "vst1.32 {q4}, [%[C]]! \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - "sub %[c], %[c], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - "vld1.32 {q5}, [%[c]]! \n\t" - "vst1.32 {q5}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" - - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -} - -// C = alpha * A * B + beta * C -void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} - -// C = A * B + C -void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; - - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[C]] \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" - - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[C]] \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vst1.32 {q12, q13}, [%[C]]! 
\n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - : [C] "+r"(C), [c] "+r"(c) - : [nc1] "r"(nc1) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); - - if (_nc1 != 0) { - for (int j = 0; j < _nc1; j++) { - *C++ += *c++; - } - } -} - -// C = A * B + C, relu(C) -void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; - - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[C]] \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" - - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[C]] \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - : [C] "+r"(C), [c] "+r"(c) - : [nc1] "r"(nc1) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); - - if (_nc1 != 0) { - for (int j = 0; j < _nc1; j++) { - *C += *c; - if (*C < 0) { - *C = 0; - } - C++; - c++; - } - } -} - -// C = A * B, batchnorm(C) -void Gemm::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, - float *bias) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); - - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[scale]]! \n\t" - "vld1.32 {q10, q11}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q2 \n\t" - "vmla.f32 q11, q1, q3 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" - - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[scale]]! 
\n\t" - "vld1.32 {q12, q13}, [%[bias]]! \n\t" - "vmla.f32 q12, q4, q6 \n\t" - "vmla.f32 q13, q5, q7 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - - "sub %[c], %[c], %[nc3] \n\t" - "sub %[scale], %[scale], %[nc3] \n\t" - "sub %[bias], %[bias], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" - - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); -} - -// C = A * B, batchnorm(C), relu(C) -void Gemm::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, - float *bias) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); - - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[scale]]! \n\t" - "vld1.32 {q10, q11}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q2 \n\t" - "vmla.f32 q11, q1, q3 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" - - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[scale]]! \n\t" - "vld1.32 {q12, q13}, [%[bias]]! 
\n\t" - "vmla.f32 q12, q4, q6 \n\t" - "vmla.f32 q13, q5, q7 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" - - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" - - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - - "sub %[c], %[c], %[nc3] \n\t" - "sub %[scale], %[scale], %[nc3] \n\t" - "sub %[bias], %[bias], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" - - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13", "q14"); -} - -#endif // __aarch64__ -#endif // __ARM_NEON - -// 32位 float 矩阵乘法 -void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *bias) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 32 * 1024; - int L2 = 512 * 1024; - - KC = k; - MC = L1 / (KC * sizeof(float)); - NC = L2 / (KC * sizeof(float)); - - // make sure MC is multiple of MR, and NC is multiple of NR - if (MC == 0) { - MC = MR; - } else { - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; - } - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC 
<< "\n"; - if (NC == 0) { - NC = NR; - } else { - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; - } - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); - - int mc, nc; - for (int j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); -#if __aarch64__ - // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB); - PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB, false); -#else - PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB, false); -#endif - for (int i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); -#if __aarch64__ - PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA, false); - // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA); -#else - PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA, false); -#endif - if (bias == nullptr) { - InnerKernelWithBias(mc, nc, alpha, packedA, packedB, beta, packedC, - &C(i, j), ldc, relu, nullptr); - } else { - InnerKernelWithBias(mc, nc, alpha, packedA, packedB, beta, packedC, - &C(i, j), ldc, relu, bias + i); - } - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); -} - -void Gemm::SgemmWithBn(int m, int n, int k, float alpha, const float *A, - int lda, const float *B, int ldb, float beta, float *C, - int ldc, bool relu, float *new_scale, float *new_bias, - float *bias) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 32 * 1024; - int L2 = 512 * 1024; - - KC = k; - MC = L1 / (KC * sizeof(float)); - NC = L2 / (KC * sizeof(float)); - - // make sure MC is multiple of MR, and NC is multiple of NR - if (MC == 0) { - MC = 
MR; - } else { - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; - } - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - if (NC == 0) { - NC = NR; - } else { - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; - } - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); - - int mc, nc; - for (int j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); -#if __aarch64__ - // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB); - PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB, false); -#else - PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB, false); -#endif - for (int i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); -#if __aarch64__ - PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA, false); - // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA); -#else - PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA, false); -#endif - if (bias == nullptr) { - InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, - &C(i, j), ldc, relu, new_scale + i, new_bias + i); - } else { - InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC, - &C(i, j), ldc, relu, new_scale + i, new_bias + i, - bias + i * ldc + j); - } - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); -} - -void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 
cluster) - int L1 = 32 * 1024; - int L2 = 0.5 * 1024 * 1024; - - KC = k; - MC = L1 / (KC * sizeof(float)); - NC = L2 / (KC * sizeof(float)); - - // make sure MC is multiple of MR, and NC is multiple of NR - if (MC == 0) { - MC = MR; - } else { - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; - } - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - if (NC == 0) { - NC = NR; - } else { - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; - } - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); - - int mc, nc; - for (int j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); -#if __aarch64__ - // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB); - PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB, false); -#else - PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB, false); -#endif - for (int i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); -#if __aarch64__ - PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA, false); - // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA); -#else - PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA, false); -#endif - if (bias1 == nullptr) { - InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc, - p + i, mode, bias + i, nullptr); - } else { - InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc, - p + i, mode, bias + i, bias1 + i * ldc + j); - } - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); -} - -// 32位 float 矩阵乘法 -void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int 
lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *bias) { -#ifndef __aarch64__ - if (m == 1 && bias == nullptr) { - return VectorKernel(m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, relu); - } -#endif // __aarch64__ -#ifdef _OPENMP - int max_threads = omp_get_max_threads(); -#else - int max_threads = 1; -#endif - - // int L1 = 64 / max_threads * 1024; - int L = (max_threads > 2) ? 64 : 32; - int L1 = L / max_threads * 1024; - KC = k; - if (m > n) { - // 对 A 分块 - MC = L1 / (KC * sizeof(float)); - if (MC == 0) { - MC = MR; - } else { - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; - } - // 补齐 B - NC = (n + NR - 1) / NR * NR; - -#if __aarch64__ - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_16c; - procAddDot = &Gemm::AddDot6x16; -#else - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_8c; - procAddDot = &Gemm::AddDot6x8; -#endif - - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB, true); - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); - } else { - // 对 B 分块 - NC = L1 / (KC * sizeof(float)); - if (NC == 0) { - NC = NR; - } else { - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; - } - // 补齐 A - MC = (m + MR - 1) / MR * MR; - -#if __aarch64__ - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_16c; - procAddDot = &Gemm::AddDot6x16; -#else - - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_8c; - procAddDot = &Gemm::AddDot6x8; -#endif - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(m, KC, m % MR, A, lda, packedA, true); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); - } - packedC = static_cast( - 
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); - - if (m > n) { -#pragma omp parallel for - for (int i = 0; i < m; i += MC) { -#ifdef _OPENMP - int local_threads = omp_get_thread_num(); -#else - int local_threads = 0; -#endif - - int mc; - mc = s_min(m - i, MC); - float *local_A = packedA + MC * KC * local_threads; - float *local_C = packedC + MC * NC * local_threads; - (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A, false); - if (bias == nullptr) { - InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, - &C(i, 0), ldc, relu, nullptr); - } else { - InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, - &C(i, 0), ldc, relu, bias + i); - } - } - } else { -#pragma omp parallel for - for (int j = 0; j < n; j += NC) { -#ifdef _OPENMP - int local_threads = omp_get_thread_num(); -#else - int local_threads = 0; -#endif - - int nc; - nc = s_min(n - j, NC); - float *local_B = packedB + KC * NC * local_threads; - float *local_C = packedC + MC * NC * local_threads; - (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B, false); - InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C, - &C(0, j), ldc, relu, bias); - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); -} - -void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, - int lda, const float *B, int ldb, float beta, - float *C, int ldc, bool relu, float *new_scale, - float *new_bias, float *bias) { -#ifdef _OPENMP - int max_threads = omp_get_max_threads(); -#else - int max_threads = 1; -#endif - - int L1 = 64 / max_threads * 1024; - KC = k; - if (m > n) { - // 对 A 分块 - MC = L1 / (KC * sizeof(float)); - if (MC == 0) { - MC = MR; - } else { - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; - } - // 补齐 B - NC = (n + NR - 1) / NR * NR; - -#if __aarch64__ - procPackA = 
&Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_16c; - procAddDot = &Gemm::AddDot6x16; -#else - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_8c; - procAddDot = &Gemm::AddDot6x8; -#endif - - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB, true); - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); - } else { - // 对 B 分块 - NC = L1 / (KC * sizeof(float)); - if (NC == 0) { - NC = NR; - } else { - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; - } - // 补齐 A - MC = (m + MR - 1) / MR * MR; - -#if __aarch64__ - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_16c; - procAddDot = &Gemm::AddDot6x16; -#else - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_8c; - procAddDot = &Gemm::AddDot6x8; -#endif - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(m, KC, m % MR, A, lda, packedA, true); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); - } - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); - - if (m > n) { -#pragma omp parallel for - for (int i = 0; i < m; i += MC) { -#ifdef _OPENMP - int local_threads = omp_get_thread_num(); -#else - int local_threads = 0; -#endif - - int mc; - mc = s_min(m - i, MC); - float *local_A = packedA + MC * KC * local_threads; - float *local_C = packedC + MC * NC * local_threads; - (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A, false); - if (bias == nullptr) { - InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C, - &C(i, 0), ldc, relu, new_scale + i, new_bias + i); - } else { - InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C, - &C(i, 0), ldc, relu, new_scale + i, new_bias + i, - bias + i * ldc); - } 
- } - } else { -#pragma omp parallel for - for (int j = 0; j < n; j += NC) { -#ifdef _OPENMP - int local_threads = omp_get_thread_num(); -#else - int local_threads = 0; -#endif - - int nc; - nc = s_min(n - j, NC); - float *local_B = packedB + KC * NC * local_threads; - float *local_C = packedC + MC * NC * local_threads; - (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B, false); - if (bias == nullptr) { - InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C, - &C(0, j), ldc, relu, new_scale, new_bias); - } else { - InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C, - &C(0, j), ldc, relu, new_scale, new_bias, - bias + j); - } - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); -} - -void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, - float *p, std::string mode, float *bias, - float *bias1) { -#ifdef _OPENMP - int max_threads = omp_get_max_threads(); -#else - int max_threads = 1; -#endif - - int L1 = 8 * 1024; - KC = k; - if (m > n) { - // 对 A 分块 - MC = L1 / (KC * sizeof(float)); - if (MC == 0) { - MC = MR; - } else { - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; - } - // 补齐 B - NC = (n + NR - 1) / NR * NR; - -#if __aarch64__ - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_16c; - procAddDot = &Gemm::AddDot6x16; -#else - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_8c; - procAddDot = &Gemm::AddDot6x8; -#endif - - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB, true); - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); - } else { - // 对 B 分块 - NC = L1 / (KC * sizeof(float)); - if (NC == 0) { - NC = NR; - } else { - int nblock_num = (n + NC - 1) / 
NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; - } - // 补齐 A - MC = (m + MR - 1) / MR * MR; - -#if __aarch64__ - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_16c; - procAddDot = &Gemm::AddDot6x16; -#else - procPackA = &Gemm::PackMatrixA_6r; - procPackB = &Gemm::PackMatrixB_8c; - procAddDot = &Gemm::AddDot6x8; -#endif - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(m, KC, m % MR, A, lda, packedA, true); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); - } - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); - - if (m > n) { -#pragma omp parallel for - for (int i = 0; i < m; i += MC) { -#ifdef _OPENMP - int local_threads = omp_get_thread_num(); -#else - int local_threads = 0; -#endif - - int mc; - mc = s_min(m - i, MC); - float *local_A = packedA + MC * KC * local_threads; - float *local_C = packedC + MC * NC * local_threads; - (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A, false); - if (bias1 == nullptr) { - InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc, - p + i, mode, bias + i, nullptr); - } else { - InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc, - p + i, mode, bias + i, bias1 + i * ldc); - } - } - } else { -#pragma omp parallel for - for (int j = 0; j < n; j += NC) { -#ifdef _OPENMP - int local_threads = omp_get_thread_num(); -#else - int local_threads = 0; -#endif - - int nc; - nc = s_min(n - j, NC); - float *local_B = packedB + KC * NC * local_threads; - float *local_C = packedC + MC * NC * local_threads; - (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B, false); - if (bias1 == nullptr) { - InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p, - mode, bias, nullptr); - } else { - InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p, - mode, bias, bias1 + 
j); - } - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gemm.h b/mobile/src/operators/math/gemm.h deleted file mode 100644 index fdbae47112..0000000000 --- a/mobile/src/operators/math/gemm.h +++ /dev/null @@ -1,492 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "common/log.h" -#include "memory/t_malloc.h" -#ifdef _OPENMP -#include -#endif - -// 矩阵取值运算宏,假设矩阵按行存储 -#define A(i, j) A[(i)*lda + (j)] -#define B(i, j) B[(i)*ldb + (j)] -#define C(i, j) C[(i)*ldc + (j)] - -#if __aarch64__ -#define MR_INT8 4 -#define NR_INT8 4 -#define MR 6 -#define NR 16 -#else -#define MR_INT8 4 -#define NR_INT8 2 -#define MR 6 -#define NR 8 -#endif - -#define s_min(i, j) ((i) < (j) ? 
(i) : (j)) - -namespace paddle_mobile { -namespace operators { -namespace math { - -class Gemm { - public: - typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *, - const bool); - typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *, - int); - FnPack procPackA; - FnPack procPackB; - FnAddDot procAddDot; - - void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, - float *buffer, const bool parallel); - void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, - float *buffer, const bool parallel); - void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer, const bool parallel); -#if __aarch64__ - void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer, const bool parallel); - void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer, const bool parallel); -#endif - - // 分块矩阵乘法 - void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, - float beta, float *c, float *C, int ldc, bool relu); - void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *bias); - - void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *new_scale, float *new_bias); - void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *new_scale, - float *new_bias, float *bias); - void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, - float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); - - // 计算一个更小的 C 矩阵分块 -#if __aarch64__ - void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); - void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc); - void AddDot6x16(int k, 
const float *a, const float *b, float *c, int ldc); -#else - void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); - void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); - void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); -#endif - - // 分块矩阵乘法结果回写 - // C = A * B - void WriteBasic(int mc, int nc, float *c, float *C, int ldc); - // C = alpha * A * B + beta * C - void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); - // C = A * B + C - void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); - // C = A * B + bias - void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias); - // C = A * B + C, relu(C) - void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); - // C = A * B + C,prelu(C) - void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); - // C = A * B + bias ,relu(C) - void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, - float *bias); - // C = A * B, batchnorm(C) - void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias); - // C = A * B, batchnorm(C), relu(C) - void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias); - void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias, float *bias1); - - // 向量矩阵乘法 (M = 1) -#if __aarch64__ -#else - void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu); - - void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, - int lda, const float *B, int ldb, float beta, - float *C, int ldc, bool relu, float *new_scale, - float *new_bias); - - // 向量矩阵乘法结果回写 - // C = A * B - void VecWriteBasic(int n, float *c, float *C, int ldc); - // C = alpha * A * B + beta * C - void 
VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); - // C = A * B + C - void VecWriteWithAdd(int n, float *c, float *C, int ldc); - // C = A * B + C, relu(C) - void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); - // C = A * B, batchnorm(C) - void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, - float *new_bias); - // C = A * B, batchnorm(C), relu(C) - void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, - float *new_bias); -#endif - - // 32位 float 矩阵乘法 - void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, bool relu, - float *bias); - - // 32位 float 矩阵乘法, 并对结果进行 batchnrom - void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *new_scale, float *new_bias, float *bias); - - void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); - - // 32位 float 矩阵乘法(openmp 多线程版本) - void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *bias); - - // 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本) - void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, - int lda, const float *B, int ldb, float beta, float *C, - int ldc, bool relu, float *new_scale, float *new_bias, - float *bias); - - void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); - - // 8 bits function cluster begins - // 8 bits int small block inner product, data packed k = 1 - void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc); - void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t 
ldc); - // 8 bits int small block inner product, data packed k = 16 - void AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc); - void AddDot4x4(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc); - - // 8 bits int inner product - template - void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a, - const int8_t *b, float beta, int32_t *c, Otype *C, - int32_t ldc, bool relu); - template - void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a, - const int8_t *b, float beta, int32_t *c, Otype *C, - int32_t ldc, bool relu, int32_t *bias, - bool addOnRow = false); - - // 8 bits int pack function - void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, - int32_t lda, int8_t *buffer); - void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, - int32_t lda, int8_t *buffer); - void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, - int32_t ldb, int8_t *buffer); - void PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, - int32_t lda, int8_t *buffer); - void PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, - int32_t ldb, int8_t *buffer); - void PackMatrixB_4c_16(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, - int32_t ldb, int8_t *buffer); - void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, - int32_t lda, int8_t *buffer); - void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, - int32_t ldb, int8_t *buffer); - void PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail, - const int8_t *A, int32_t lda, int8_t *buffer); - void PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer); - void PackMatrixB_omp_4c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer); - - // 8 bits int matrix product - template - void Sgemm_omp(int32_t m, 
int32_t n, int32_t k, float alpha, const Itype *A, - int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C, - int32_t ldc, bool relu, Btype *bias, bool addOnRow = false); - template - void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, - Otype *C, int32_t ldc, bool relu, int32_t *bias, - bool addOnRow = false); - template - void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A, - int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C, - int32_t ldc, bool relu, Btype *bias, bool addOnRow = false); - template - void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C, - int32_t ldc, bool relu, int32_t *bias, bool addOnRow = false); - // 8 bits int write back - // C = A * B - void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc); - // C = A * B + bias, scale * relu(C) - void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale); - // C = A * B + bias, scale * C, bias is added on column - void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale); - // C = A * B + bias, scale * C, bias is added on row - void WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale); - - private: - int MC = 0; - int KC = 0; - int NC = 0; - - // 32位 float - float *packedA; - float *packedB; - float *packedC; - - // 8 bits int - int8_t *packedA_int8; - int8_t *packedB_int8; - int32_t *packedC_int32; - int8_t *zero_int8; -}; - -// 8 bits int matrix product (m*k x k*n) -template -void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, - Otype *C, int32_t ldc, bool relu, int32_t *bias, - bool addOnRow) { - // L1 data cache is 32 kib 
(Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int32_t L1 = 32 * 1024; - int32_t L2 = 512 * 1024; - - const int32_t k_complete = (k + 15) - ((k + 15) & 15); - KC = k_complete; - MC = L1 / (KC * sizeof(int8_t)); - NC = L2 / (KC * sizeof(int8_t)); - - // make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8 - if (MC == 0) { - MC = MR_INT8; - } else { - int32_t mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8; - } - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - if (NC == 0) { - NC = NR_INT8; - } else { - int32_t nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8; - } - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - packedA_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); - packedB_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); - packedC_int32 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC)); - zero_int8 = - static_cast(paddle_mobile::memory::Alloc(sizeof(int8_t) * k)); - - memset(static_cast(zero_int8), 0, sizeof(int8_t) * k); - int32_t mc, nc; - for (int32_t j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); -#if __aarch64__ - PackMatrixB_4c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8); -#else - PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8); -#endif - for (int32_t i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8); - if (bias == nullptr) { - InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta, - packedC_int32, &C(i, j), ldc, relu); - } else { - if (addOnRow) { - InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, - packedC_int32, &C(i, j), ldc, relu, bias + j, - addOnRow); - } else { - InnerKernelWithBias(mc, nc, alpha, 
packedA_int8, packedB_int8, beta, - packedC_int32, &C(i, j), ldc, relu, bias + i, - addOnRow); - } - } - } - } - - paddle_mobile::memory::Free(packedA_int8); - paddle_mobile::memory::Free(packedB_int8); - paddle_mobile::memory::Free(packedC_int32); - paddle_mobile::memory::Free(zero_int8); -} - -// 8 bits int matrix product (m*k x k*n), omp version -template -void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, - const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, - float beta, Otype *C, int32_t ldc, bool relu, - int32_t *bias, bool addOnRow) { -#ifdef _OPENMP - int32_t max_threads = omp_get_max_threads(); -#else - int32_t max_threads = 1; -#endif - - int32_t L1 = 64 / max_threads * 1024; - const int32_t k_complete = (k + 15) - ((k + 15) & 15); - KC = k_complete; - zero_int8 = - static_cast(paddle_mobile::memory::Alloc(sizeof(int8_t) * k)); - memset(static_cast(zero_int8), 0, sizeof(int8_t) * k); - if (m > n) { - // 对 A 分块 - MC = L1 / (KC * sizeof(int8_t)); - if (MC == 0) { - MC = MR_INT8; - } else { - int32_t mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8; - } - // 补齐 B - NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8; - - packedB_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); -#if __aarch64__ - PackMatrixB_omp_4c_16(k, n, n % NR_INT8, B, ldb, packedB_int8); -#else - PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8); -#endif - packedA_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads)); - } else { - // 对 B 分块 - NC = L1 / (KC * sizeof(int8_t)); - if (NC == 0) { - NC = NR_INT8; - } else { - int32_t nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8; - } - // 补齐 A - MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8; - - packedA_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); -#if __aarch64__ - 
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8); -#else - PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8); -#endif - packedB_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads)); - } - packedC_int32 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads)); - - if (m > n) { -#pragma omp parallel for - for (int32_t i = 0; i < m; i += MC) { -#ifdef _OPENMP - int32_t local_threads = omp_get_thread_num(); -#else - int32_t local_threads = 0; -#endif - - int32_t mc; - mc = s_min(m - i, MC); - int8_t *local_A = packedA_int8 + MC * KC * local_threads; - int32_t *local_C = packedC_int32 + MC * NC * local_threads; -#if __aarch64__ - PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A); -#else - PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A); -#endif - if (bias == nullptr) { - InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C, - &C(i, 0), ldc, relu); - } else { - if (addOnRow) { - InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, - local_C, &C(i, 0), ldc, relu, bias, addOnRow); - } else { - InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, - local_C, &C(i, 0), ldc, relu, bias + i, addOnRow); - } - } - } - } else { -#pragma omp parallel for - for (int32_t j = 0; j < n; j += NC) { -#ifdef _OPENMP - int32_t local_threads = omp_get_thread_num(); -#else - int32_t local_threads = 0; -#endif - int32_t nc; - nc = s_min(n - j, NC); - int8_t *local_B = packedB_int8 + KC * NC * local_threads; - int32_t *local_C = packedC_int32 + MC * NC * local_threads; -#if __aarch64__ - PackMatrixB_4c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B); -#else - PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B); -#endif - if (bias == nullptr) { - InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C, - &C(0, j), ldc, relu); - } else { - if (addOnRow) { - InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, - 
local_C, &C(0, j), ldc, relu, bias + j, addOnRow); - } else { - InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, - local_C, &C(0, j), ldc, relu, bias, addOnRow); - } - } - } - } - - paddle_mobile::memory::Free(packedA_int8); - paddle_mobile::memory::Free(packedB_int8); - paddle_mobile::memory::Free(packedC_int32); - paddle_mobile::memory::Free(zero_int8); -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gemm/cblas.cc b/mobile/src/operators/math/gemm/cblas.cc deleted file mode 100644 index 4428826552..0000000000 --- a/mobile/src/operators/math/gemm/cblas.cc +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#include "operators/math/gemm/cblas.h" -#include "operators/math/gemm/executor.h" -#include "operators/math/gemm/strategy.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -void cblas_sgemm(const bool transA, const bool transB, const int M, const int N, - const int K, const float alpha, const float *A, const int lda, - const float *B, const int ldb, const float beta, float *C, - const int ldc) { - if (N == 1) { - return cblas_sgemv(transA, M, K, alpha, A, lda, B, beta, C); - } else if (M == 1) { - return cblas_sgemv(!transB, N, K, alpha, B, ldb, A, beta, C); - } else { - GemmExecutor exec(transA, transB, M, N, K); - exec(alpha, A, lda, B, ldb, beta, C, ldc); - } -} - -void cblas_sgemv(const bool trans, const int M, const int N, const float alpha, - const float *A, const int lda, const float *B, - const float beta, float *C) { - GemvExecutor exec(trans, M, N); - exec(alpha, A, lda, B, beta, C); -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/gemm/cblas.h b/mobile/src/operators/math/gemm/cblas.h deleted file mode 100644 index c7c9201869..0000000000 --- a/mobile/src/operators/math/gemm/cblas.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle_mobile { -namespace operators { -namespace math { - -void cblas_sgemm(const bool transA, const bool transB, const int M, const int N, - const int K, const float alpha, const float *A, const int lda, - const float *B, const int ldb, const float beta, float *C, - const int ldc); - -void cblas_sgemv(const bool trans, const int M, const int N, const float alpha, - const float *A, const int lda, const float *B, - const float beta, float *C); - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gemm/executor.h b/mobile/src/operators/math/gemm/executor.h deleted file mode 100644 index 976415b9ac..0000000000 --- a/mobile/src/operators/math/gemm/executor.h +++ /dev/null @@ -1,266 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#ifdef _OPENMP -#include -#endif -// #include -#include -#include "common/log.h" -#include "framework/context.h" -#include "memory/t_malloc.h" -#include "operators/math/gemm/gemm_kernel.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -int CeilDiv(const int &x, const int &y) { return (x + y - 1) / y; } -unsigned int ResetL1Cache(const unsigned int L1_size, const int thread_num, - const int N, const int K) { - unsigned int L1 = L1_size; - if (thread_num == 1) { - if (N >= 30000 && K > 100) { - L1 *= 4; - } else if (N >= 10000 && K > 100) { - L1 *= 2; - } - } - return L1; -} - -class Executor { - public: - Executor() : num_threads_(1) { -#ifdef _OPENMP - num_threads_ = omp_get_max_threads(); -#endif - } - virtual ~Executor() {} - - protected: - int num_threads_; -}; - -template -class GemmExecutor : public Executor { - typedef typename Strategy::Itype Itype; - typedef typename Strategy::Otype Otype; - - public: - GemmExecutor(const bool transA, const bool transB, const int M, const int N, - const int K) - : Executor(), transA_(transA), transB_(transB), M_(M), N_(N), K_(K) { - unsigned int L1_size = 0; - unsigned int L2_size = 0; - if (M_ > N_) { - L2_size = - ResetL1Cache(framework::CPUContext::Context()->get_l1_cache_size(), - num_threads_, M_, K_); - L1_size = framework::CPUContext::Context()->get_l2_cache_size(); - } else { - L1_size = - ResetL1Cache(framework::CPUContext::Context()->get_l1_cache_size(), - num_threads_, N_, K_); - L2_size = framework::CPUContext::Context()->get_l2_cache_size(); - } - - rhs_tile_num_ = L1_size / (K_ * sizeof(Itype)); - if (rhs_tile_num_ == 0) { - rhs_tile_num_ = Strategy::out_width(); - } else { - int n_block = CeilDiv(N_, rhs_tile_num_); - rhs_tile_num_ = CeilDiv(N_, n_block); - rhs_tile_num_ = CeilDiv(rhs_tile_num_, Strategy::out_width()); - rhs_tile_num_ *= Strategy::out_width(); - } - - // lhs_tile_num_ = CeilDiv(M, Strategy::out_height()) * - // Strategy::out_height(); 
- lhs_tile_num_ = L2_size / (K_ * sizeof(Itype)); - if (lhs_tile_num_ == 0) { - lhs_tile_num_ = Strategy::out_height(); - } else { - int m_block = CeilDiv(M_, lhs_tile_num_); - lhs_tile_num_ = CeilDiv(M_, m_block); - lhs_tile_num_ = CeilDiv(lhs_tile_num_, Strategy::out_height()); - lhs_tile_num_ *= Strategy::out_height(); - } - } - - void operator()(const float alpha, const Itype *A, const int lda, - const Itype *B, const int ldb, const float beta, Otype *C, - const int ldc) { - // struct timeval tv_begin, tv_end; - // gettimeofday(&tv_begin,NULL); - if (M_ > N_) { - nblock = CeilDiv(N_, Strategy::out_width()) * Strategy::out_width(); - lhs_worksize_ = sizeof(Itype) * lhs_tile_num_ * K_ * num_threads_; - rhs_worksize_ = sizeof(Itype) * K_ * nblock; - out_worksize_ = sizeof(Otype) * lhs_tile_num_ * nblock * num_threads_; - ldc_ = nblock; - } else { - mblock = CeilDiv(M_, Strategy::out_height()) * Strategy::out_height(); - lhs_worksize_ = sizeof(Itype) * mblock * K_; - rhs_worksize_ = sizeof(Itype) * K_ * rhs_tile_num_ * num_threads_; - out_worksize_ = sizeof(Otype) * mblock * rhs_tile_num_ * num_threads_; - ldc_ = rhs_tile_num_; - } - - lhs_workspace_ = - static_cast(paddle_mobile::memory::Alloc(lhs_worksize_)); - rhs_workspace_ = - static_cast(paddle_mobile::memory::Alloc(rhs_worksize_)); - out_workspace_ = - static_cast(paddle_mobile::memory::Alloc(out_worksize_)); - - // std::cout << "M: " << M_ << ", N: " << N_ << ", K: " << K_ << std::endl; - // std::cout << "lhs_block: " << CeilDiv(M_, lhs_tile_num_) << ", " - // << "rhs_block: " << CeilDiv(N_, rhs_tile_num_) << std::endl; - - if (M_ > N_) { - strategy_.pack_rhs(K_, N_, B, ldb, rhs_workspace_, true); - - #pragma omp parallel for - for (int lhs_block = 0; lhs_block < M_; lhs_block += lhs_tile_num_) { - int lhs_range = std::min(M_ - lhs_block, lhs_tile_num_); -#ifdef _OPENMP - int thread_id = omp_get_thread_num(); -#else - int thread_id = 0; -#endif - float *local_A = lhs_workspace_ + lhs_tile_num_ * K_ * 
thread_id; - float *local_C = out_workspace_ + lhs_tile_num_ * ldc_ * thread_id; - // load lhs into lhs_workspace - strategy_.pack_lhs(lhs_range, K_, A + lhs_block * lda, lda, local_A, - false); - for (int rhs_block = 0; rhs_block < N_; rhs_block += rhs_tile_num_) { - int rhs_range = std::min(N_ - rhs_block, rhs_tile_num_); - float *local_B = rhs_workspace_ + K_ * rhs_block; - for (int rhs_tile = 0; rhs_tile < rhs_range; - rhs_tile += Strategy::out_width()) { - for (int lhs_tile = 0; lhs_tile < lhs_range; - lhs_tile += Strategy::out_height()) { - int offset = lhs_tile * ldc_ + rhs_block + rhs_tile; - strategy_.kernel(local_A + lhs_tile * K_, local_B + rhs_tile * K_, - K_, local_C + offset, ldc_); - } - } - } - strategy_.write(lhs_range, N_, alpha, local_C, ldc_, beta, - C + lhs_block * ldc, ldc); - } - } else { - strategy_.pack_lhs(M_, K_, A, lda, lhs_workspace_, true); - - #pragma omp parallel for - for (int rhs_block = 0; rhs_block < N_; rhs_block += rhs_tile_num_) { - int rhs_range = std::min(N_ - rhs_block, rhs_tile_num_); -#ifdef _OPENMP - int thread_id = omp_get_thread_num(); -#else - int thread_id = 0; -#endif - float *local_B = rhs_workspace_ + K_ * rhs_tile_num_ * thread_id; - float *local_C = out_workspace_ + mblock * ldc_ * thread_id; - // load rhs into rhs_workspace - strategy_.pack_rhs(K_, rhs_range, B + rhs_block, ldb, local_B, false); - for (int lhs_block = 0; lhs_block < M_; lhs_block += lhs_tile_num_) { - int lhs_range = std::min(M_ - lhs_block, lhs_tile_num_); - float *local_A = lhs_workspace_ + lhs_block * K_; - for (int lhs_tile = 0; lhs_tile < lhs_range; - lhs_tile += Strategy::out_height()) { - for (int rhs_tile = 0; rhs_tile < rhs_range; - rhs_tile += Strategy::out_width()) { - int offset = (lhs_block + lhs_tile) * ldc_ + rhs_tile; - strategy_.kernel(local_A + lhs_tile * K_, local_B + rhs_tile * K_, - K_, local_C + offset, ldc_); - } - } - } - strategy_.write(M_, rhs_range, alpha, local_C, ldc_, beta, - C + rhs_block, ldc); - } - } - - 
paddle_mobile::memory::Free(lhs_workspace_); - paddle_mobile::memory::Free(rhs_workspace_); - paddle_mobile::memory::Free(out_workspace_); - - // gettimeofday(&tv_end,NULL); - // float elapsed = (tv_end.tv_sec - tv_begin.tv_sec) * 1000.f + - // (tv_end.tv_usec - tv_begin.tv_usec) / 1000.f; - // std::cout << "elapsed: " << elapsed << "ms, speed: " - // << (M_ * N_ * K_ / 1000.f / 1000.f) / elapsed - // << " gflops" << std::endl; - } - - virtual ~GemmExecutor() {} - - private: - const unsigned int M_; - const unsigned int N_; - const unsigned int K_; - const bool transA_; - const bool transB_; - - unsigned int lhs_tile_num_ = 0; - unsigned int rhs_tile_num_ = 0; - unsigned int out_tile_num_ = 0; - - unsigned int lhs_worksize_ = 0; - unsigned int rhs_worksize_ = 0; - unsigned int out_worksize_ = 0; - unsigned int ldc_ = 0; - - unsigned int mblock = 0; - unsigned int nblock = 0; - - Itype *lhs_workspace_ = nullptr; - Itype *rhs_workspace_ = nullptr; - Otype *out_workspace_ = nullptr; - - Strategy strategy_; -}; - -template -class GemvExecutor : public Executor { - typedef typename Strategy::Itype Itype; - typedef typename Strategy::Otype Otype; - - public: - GemvExecutor(const bool transA, const int M, const int N) - : Executor(), M_(M), N_(N), trans_(transA) {} - - void operator()(const float alpha, const Itype *A, const int lda, - const Itype *B, const float beta, Otype *C) { - strategy_.kernel(trans_, M_, N_, alpha, A, lda, B, beta, C); - } - - virtual ~GemvExecutor() {} - - private: - const unsigned int M_; - const unsigned int N_; - const bool trans_; - - Strategy strategy_; -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gemm/gemm1x1s1.cpp b/mobile/src/operators/math/gemm/gemm1x1s1.cpp deleted file mode 100644 index 2fd78fa189..0000000000 --- a/mobile/src/operators/math/gemm/gemm1x1s1.cpp +++ /dev/null @@ -1,2223 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#ifdef CONV_OP - -#include "operators/math/gemm/gemm1x1s1.h" -#include -#include "framework/context.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -#ifdef __aarch64__ -void prepackA_8x12(float *out, const float *in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - uint32_t *dout = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - int stride = x_len * 8; - -#pragma omp parallel for - for (int y = m0; y < mmax; y += 8) { - uint32_t *outptr = dout + stride * (y - m0) / 8; - - const uint32_t *inptr0 = inptr + y * ldin + k0; - const uint32_t *inptr1 = inptr0 + ldin; - const uint32_t *inptr2 = inptr1 + ldin; - const uint32_t *inptr3 = inptr2 + ldin; - const uint32_t *inptr4 = inptr3 + ldin; - const uint32_t *inptr5 = inptr4 + ldin; - const uint32_t *inptr6 = inptr5 + ldin; - const uint32_t *inptr7 = inptr6 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - : - : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), - [ptr3] "r"(inptr3), [ptr4] "r"(inptr4), [ptr5] "r"(inptr5), - [ptr6] "r"(inptr6), [ptr7] "r"(inptr7) - : "memory"); - - int x = x_len; - //! 
cope with row index exceed real size, set to zero buffer - if ((y + 7) >= mmax) { - switch ((y + 7) - mmax) { - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - for (; x > 7; x -= 8) { - asm volatile( - // Load up 8 elements (2 vectors) from each of 8 sources. - "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - "prfm pldl1keep, [%[inptr0], #128] \n" - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr1], #128]\n" - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "prfm pldl1keep, [%[inptr2], #128]\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "prfm pldl1keep, [%[inptr3], #128]\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first - // element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second - // element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr4], #128]\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr5], #128]\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, 
v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr6], #128]\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr7], #128]\n" - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "cc", "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#else //__aarch64__ -void prepackA_6x8(float* out, const float* in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - uint32_t* dout = reinterpret_cast(out); - const uint32_t* inptr = reinterpret_cast(in); - uint32_t* outptr = dout; - - //! 
data A is not transposed, transpose A to k * 6 - for (int y = m0; y < mmax; y += 6) { - const uint32_t* inptr0 = inptr + y * ldin + k0; - const uint32_t* inptr1 = inptr0 + ldin; - const uint32_t* inptr2 = inptr1 + ldin; - const uint32_t* inptr3 = inptr2 + ldin; - const uint32_t* inptr4 = inptr3 + ldin; - const uint32_t* inptr5 = inptr4 + ldin; - - int x = x_len; - //! cope with row index exceed real size, set to zero buffer - if ((y + 5) >= mmax) { - switch ((y + 5) - mmax) { - case 4: - inptr1 = zerobuff; - case 3: - inptr2 = zerobuff; - case 2: - inptr3 = zerobuff; - case 1: - inptr4 = zerobuff; - case 0: - inptr5 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - //! zip load 8 elements (2 neon Q registers) from each of 6 rows - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " - "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" - "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " - "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" - - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " - "q10=r44,r54,r45,r55\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - "vst1.32 {d17}, [%[outptr]]! 
@ write d16(q8,high),r41,r51\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " - "q11=r46,r56,r47,r57\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" - "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" - "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" - "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" - "vst1.32 {d23}, [%[outptr]]! 
@ write d23(q11,high),r47,r57\n" - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), - [outptr] "+r"(outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "cc", "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } -} - -void prepackA_4x8(float* out, const float* in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - uint32_t* dout = reinterpret_cast(out); - const uint32_t* inptr = reinterpret_cast(in); - - uint32_t* outptr = dout; - //! data A is not transposed, transpose A to k * 4 - for (int y = m0; y < mmax; y += 4) { - const uint32_t* inptr0 = inptr + y * ldin + k0; - const uint32_t* inptr1 = inptr0 + ldin; - const uint32_t* inptr2 = inptr1 + ldin; - const uint32_t* inptr3 = inptr2 + ldin; - - int x = x_len; - //! cope with row index exceed real size, set to zero buffer - if ((y + 3) >= mmax) { - switch ((y + 3) - mmax) { - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - //! zip load 8 elements (2 neon Q registers) from each of 4 rows - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! 
@ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d14-d15},[%[outptr]]! 
@ write q7:r07,r17,r27,r37\n" - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), [outptr] "+r"(outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "cc", "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - } - } -} -#endif //__aarch64__ - -void prepackA(float *out, const float *in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax, bool is_trans, - ARMArch arch) { -#ifdef __aarch64__ - if (!is_trans) { - prepackA_8x12(out, in, ldin, m0, mmax, k0, kmax); - } -#else - if (arch == A73) { - if (!is_trans) { - prepackA_4x8(out, in, ldin, m0, mmax, k0, kmax); - } - } else { - if (!is_trans) { - prepackA_6x8(out, in, ldin, m0, mmax, k0, kmax); - } - } -#endif -} - -void gemm1x1s1_transform_weight(const framework::Tensor &weight, - const framework::Tensor &output, - framework::Tensor *trans_weight, - const int group, ARMArch arch) { - const int chout = weight.dims()[0]; - const int chin = weight.dims()[1]; - const int hout = output.dims()[2]; - const int wout = output.dims()[3]; - const int m = chout / group; - const int n = hout * wout; - const int k = chin / group; - - if (n > 1) { - int hblock = get_hblock(arch); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; - int weight_worksize = sizeof(float) * weights_size_per_group * group; - float *w_trans_ptr = trans_weight->mutable_data({weight_worksize}); - for (int g = 0; g < group; ++g) { - const float *weights_group = weight.data() + g * m * k; - float *weights_trans_ptr = w_trans_ptr + g * weights_size_per_group; - prepackA(weights_trans_ptr, weights_group, k, 0, m, 0, k, false, arch); - } - } -} - -#ifdef __aarch64__ -void loadb(float *out, const float *in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax) { - uint32_t 
*outptr = reinterpret_cast(out); - const uint32_t *inptr = - reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 12 * (x_len / 12); - int right_pad = 12 - right_remain; - const size_t copy_len_remain = sizeof(float) * right_remain; - const size_t copy_len_pad = sizeof(float) * right_pad; - const size_t size_ldin = sizeof(float) * ldin; - - uint32_t *outptr_row = outptr; - int stride_out = 12 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - uint32x4_t vmask3 = - vcltq_u32(vld1q_u32(mask_buffer + 8), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t *ptr0 = inptr + y * ldin; - const uint32_t *ptr1 = ptr0 + ldin; - const uint32_t *ptr2 = ptr1 + ldin; - const uint32_t *ptr3 = ptr2 + ldin; - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - : - : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) - : "memory"); - - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - vst1q_u32(outptr_row_col, vr00); - vst1q_u32(outptr_row_col + 4, vr01); - vst1q_u32(outptr_row_col + 8, vr02); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 
4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col + 12, vr10); - vst1q_u32(outptr_row_col + 16, vr11); - vst1q_u32(outptr_row_col + 20, vr12); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 24, vr20); - vst1q_u32(outptr_row_col + 28, vr21); - vst1q_u32(outptr_row_col + 32, vr22); - - vst1q_u32(outptr_row_col + 36, vr30); - vst1q_u32(outptr_row_col + 40, vr31); - vst1q_u32(outptr_row_col + 44, vr32); - - ptr0 += 12; - ptr1 += 12; - ptr2 += 12; - ptr3 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - uint32x4_t vr00_1 = vbslq_u32(vmask1, vr00, vzero); - uint32x4_t vr01_1 = vbslq_u32(vmask2, vr01, vzero); - uint32x4_t vr02_1 = vbslq_u32(vmask3, vr02, vzero); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col, vr00_1); - vst1q_u32(outptr_row_col + 4, vr01_1); - vst1q_u32(outptr_row_col + 8, vr02_1); - - uint32x4_t vr10_1 = vbslq_u32(vmask1, vr10, vzero); - uint32x4_t vr11_1 = vbslq_u32(vmask2, vr11, vzero); - uint32x4_t vr12_1 = vbslq_u32(vmask3, vr12, vzero); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 12, vr10_1); - vst1q_u32(outptr_row_col + 16, vr11_1); - vst1q_u32(outptr_row_col + 20, vr12_1); - - uint32x4_t vr20_1 = vbslq_u32(vmask1, vr20, vzero); - uint32x4_t vr21_1 = vbslq_u32(vmask2, vr21, vzero); - uint32x4_t vr22_1 = vbslq_u32(vmask3, vr22, vzero); - - uint32x4_t vr30_1 = vbslq_u32(vmask1, vr30, vzero); - uint32x4_t vr31_1 = vbslq_u32(vmask2, vr31, vzero); - uint32x4_t 
vr32_1 = vbslq_u32(vmask3, vr32, vzero); - - vst1q_u32(outptr_row_col + 24, vr20_1); - vst1q_u32(outptr_row_col + 28, vr21_1); - vst1q_u32(outptr_row_col + 32, vr22_1); - - vst1q_u32(outptr_row_col + 36, vr30_1); - vst1q_u32(outptr_row_col + 40, vr31_1); - vst1q_u32(outptr_row_col + 44, vr32_1); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t *ptr0 = inptr + y * ldin; - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - vst1q_u32(outptr_row_col, vr0); - vst1q_u32(outptr_row_col + 4, vr1); - vst1q_u32(outptr_row_col + 8, vr2); - - ptr0 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr0_1 = vbslq_u32(vmask1, vr0, vzero); - uint32x4_t vr1_1 = vbslq_u32(vmask2, vr1, vzero); - uint32x4_t vr2_1 = vbslq_u32(vmask3, vr2, vzero); - - vst1q_u32(outptr_row_col, vr0_1); - vst1q_u32(outptr_row_col + 4, vr1_1); - vst1q_u32(outptr_row_col + 8, vr2_1); - } - } -} -#else //__aarch64__ -void loadb(float* out, const float* in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax) { - uint32_t* outptr = reinterpret_cast(out); - const uint32_t* inptr = - reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 8 * (x_len / 8); - int right_pad = 8 - right_remain; - const size_t copy_len_remain = sizeof(float) * right_remain; - const size_t copy_len_pad = sizeof(float) * right_pad; - const size_t size_ldin = sizeof(float) * ldin; - - uint32_t* outptr_row = outptr; - int stride_out = 8 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - 
vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t* ptr0 = inptr + y * ldin; - const uint32_t* ptr1 = ptr0 + ldin; - const uint32_t* ptr2 = ptr1 + ldin; - const uint32_t* ptr3 = ptr2 + ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! 
@ load r3, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t* ptr0 = inptr + y * ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! 
@ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "cc", "memory"); - } - } -} -#endif //__aarch64__ - -#ifdef __aarch64__ -void sgemm_conv_8x12(const float *A_packed, const float *B, const float *bias, - float *C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB) { - const int threads = framework::CPUContext::Context()->get_thread_num(); - int l2_size = - framework::CPUContext::Context()->get_l2_cache_size() / sizeof(float); - int l2_cache = l2_size > 0 ? l2_size : 512 * 1024; - - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - // unroll 2 loop - int tail_pre = (K & (KBLOCK - 1)); - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - - bool flag_p_remain = false; - int remain = 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel - float *b_pannel = - static_cast(framework::CPUContext::Context()->get_work_space( - K * (xmax - x0) * sizeof(float))); - if (!transB) { - loadb(b_pannel, B, N, 0, K, x0, xmax); - } -#pragma omp parallel for num_threads(threads) - for (unsigned int y = 0; y < M; y += MBLOCK) { - unsigned int ymax = y + MBLOCK; - if (ymax > M) { - ymax = M; - } - - float bias_local[8] = {0}; - if (is_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - bias_local[4] = bias[y + 4]; - bias_local[5] = bias[y + 5]; - bias_local[6] = bias[y + 6]; - bias_local[7] = bias[y + 7]; - } - - float cout0[NBLOCK]; - float cout1[NBLOCK]; - float cout2[NBLOCK]; - float cout3[NBLOCK]; - float cout4[NBLOCK]; - float cout5[NBLOCK]; - float cout6[NBLOCK]; - float cout7[NBLOCK]; - - float *c_ptr0 = C + y * N + x0; - float *c_ptr1 = c_ptr0 + N; - float *c_ptr2 = c_ptr1 + N; - float *c_ptr3 = c_ptr2 + N; - float *c_ptr4 = c_ptr3 + N; - float *c_ptr5 = c_ptr4 + N; - float *c_ptr6 = c_ptr5 + N; - float *c_ptr7 = c_ptr6 + N; - - float *pout0 = c_ptr0; - float *pout1 = c_ptr1; - float *pout2 = c_ptr2; - float *pout3 = c_ptr3; - float *pout4 = c_ptr4; - float *pout5 = c_ptr5; - float *pout6 = c_ptr6; - float *pout7 = c_ptr7; - - const float *a_ptr_l = A_packed + y * K; - const float *b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - c_ptr1 = cout1; - case 5: - c_ptr2 = cout2; - case 4: - c_ptr3 = cout3; - case 3: - c_ptr4 = cout4; - case 2: - c_ptr5 = cout5; - case 1: - c_ptr6 = cout6; - case 0: - c_ptr7 = cout7; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - pout4 = c_ptr4; - pout5 = c_ptr5; - pout6 = c_ptr6; - pout7 = c_ptr7; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - c_ptr4 = cout4; - c_ptr5 = cout5; - c_ptr6 = 
cout6; - c_ptr7 = cout7; - } - const float *a_ptr = a_ptr_l; - int tail = tail_pre; - int k = k_pre; - - asm volatile( - // Initialize result registers, load initial operands, prime - // prefetches. - "ldp q2, q3, [%[bias_ptr]]\n" /* load bias to q2, q3*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ - "dup v9.4s, v2.s[0]\n" /* out1 = 0*/ - "dup v10.4s, v2.s[0]\n" /* out2 = 0*/ - "dup v11.4s, v2.s[1]\n" /* out3 = 0*/ - "dup v12.4s, v2.s[1]\n" /* out4 = 0*/ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ - "dup v13.4s, v2.s[1]\n" /* out5 = 0*/ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ - "dup v14.4s, v2.s[2]\n" /* out6 = 0*/ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ - "dup v15.4s, v2.s[2]\n" /* out7 = 0*/ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ - "dup v16.4s, v2.s[2]\n" /* out8 = 0*/ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ - "dup v17.4s, v2.s[3]\n" /* out9 = 0*/ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ - "dup v18.4s, v2.s[3]\n" /* out10 = 0*/ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ - "dup v19.4s, v2.s[3]\n" /* out11 = 0*/ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ - "dup v20.4s, v3.s[0]\n" /* out12 = 0*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ - "dup v21.4s, v3.s[0]\n" /* out13 = 0*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ - "dup v22.4s, v3.s[0]\n" /* out14 = 0*/ - "dup v23.4s, v3.s[1]\n" /* out15 = 0*/ - "dup v24.4s, v3.s[1]\n" /* out16 = 0*/ - "dup v25.4s, v3.s[1]\n" /* out17 = 0*/ - "dup v26.4s, v3.s[2]\n" /* out18 = 0*/ - "dup v27.4s, v3.s[2]\n" /* out19 = 0*/ - "dup v28.4s, v3.s[2]\n" /* out20 = 0*/ - "dup v29.4s, v3.s[3]\n" /* out21 = 0*/ - "dup v30.4s, v3.s[3]\n" /* out22 = 0*/ - "dup v31.4s, v3.s[3]\n" /* out23 = 0*/ - "cbz %w[k], 2f\n" /* check loop count > 0 */ - /* main loop */ - /* unrool 0*/ - "1:\n" /* main loop */ - 
"fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = q4 - */ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = q4 - */ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = q4 - */ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = q4 - */ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = q4 - */ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = q4 - */ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = q4 - */ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = q4 - */ - - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = q5 */ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = q5 - */ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q5*/ - - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ - - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 = - q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q6*/ - - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ - - /* unrool 1 */ - 
"fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = q7 - */ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = q7 - */ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = q7 - */ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = q7 - */ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = q7 - */ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q7 - */ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = q7 - */ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = q7 - */ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ - - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = q4 */ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 = q4 - */ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q4*/ - - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ - /* unrool 2*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = q6 - */ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = q6 - 
*/ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = q7*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q7*/ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q4*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 = - q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q4*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - /* unrool 3*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ - "fmla v17.4s, v5.4s, 
v2.s[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 = q6*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "subs %w[k], %w[k], #1\n" /* loop count - 1*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "bne 1b\n" - /* Target to use when K is 1 or 2 (i.e. 
zero iterations of main - loop)*/ - "2:\n" /* process tail*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "beq 3f\n" /*jump to tail = 1*/ - /* final unrool 0*/ - /* unrool 0, tail > 1*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = q4*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = - q4*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = q4*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = q4*/ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, q3*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = q4*/ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = q4*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = q4*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = q5*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = q5*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q6*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 = - q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - 
q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q6*/ - "beq 4f\n" /*jump to tail = 2*/ - /* unrool 1, tail > 2*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = q7*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q7*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = q7*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = q7*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = q7*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q7*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = q7*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = q4*/ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 = q4*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q5*/ - "beq 5f\n" /*jump to tail = 3*/ - /* 
unrool 2, tail = 4*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = q6*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = - q6*/ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = q7*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q4*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 = - q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q4*/ - /* unrool 3, tail = 4*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 
* a10[1], b0 = - q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ - "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "b 11f\n" - /* tails==1 final tail*/ - "3: \n" /* tail=1*/ - "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ - "fmla 
v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "b 11f\n" - /* tails==2 final tail*/ - "4:\n" /* tail = 2*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ - "fmla v12.4s, v4.4s, 
v2.s[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "b 11f\n" - /* tails==3 final tail*/ - "5:\n" /* tail = 3*/ - "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 
* a10[0], b1 = - q6*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "11: \n" /* check if relu */ - "cbz %w[relu], 12f\n" /* skip relu */ - "movi v2.4s, #0\n" /* for relu*/ - "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ - "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ - "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ - "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ - "fmax v12.4s, v12.4s, v2.4s\n" /* relu*/ - "fmax v13.4s, v13.4s, v2.4s\n" /* relu*/ - "fmax v14.4s, v14.4s, v2.4s\n" /* relu*/ - "fmax v15.4s, v15.4s, v2.4s\n" /* relu*/ - "fmax v16.4s,v16.4s,v2.4s\n" /* relu*/ - "fmax v17.4s,v17.4s,v2.4s\n" /* relu*/ - "fmax v18.4s, v18.4s, v2.4s\n" /* relu*/ - "fmax v19.4s, v19.4s, v2.4s\n" /* relu*/ - "fmax v20.4s, v20.4s, v2.4s\n" /* relu*/ - "fmax v21.4s, v21.4s, v2.4s\n" /* relu*/ - "fmax v22.4s, v22.4s, v2.4s\n" /* relu*/ - "fmax v23.4s, v23.4s, v2.4s\n" /* relu*/ - "fmax v24.4s,v24.4s,v2.4s\n" /* relu*/ - "fmax v25.4s,v25.4s,v2.4s\n" /* relu*/ - "fmax v26.4s, v26.4s, v2.4s\n" /* relu*/ - "fmax v27.4s, v27.4s, v2.4s\n" /* relu*/ - "fmax v28.4s, v28.4s, v2.4s\n" /* relu*/ - "fmax v29.4s, v29.4s, v2.4s\n" /* relu*/ - "fmax v30.4s, v30.4s, v2.4s\n" /* relu*/ - "fmax v31.4s, v31.4s, v2.4s\n" /* relu*/ - "12: \n" - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ - 
"st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [k] "+r"(k), - [tail] "+r"(tail), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias_local), [relu] "r"(is_relu) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", - "v29", "v30", "v31"); - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - *pout6++ = cout6[i]; - *pout7++ = cout7[i]; - } - } - } - } - } -} -#else //__aarch64__ -/** - * \brief gemm with ablock = 6, bblock = 8, output 6x8 - * @param A - * @param B - * @param C - * @param M - * @param N - * @param K - * @param threads - * @param workspace - */ -void sgemm_conv_6x8(const float* A_packed, const float* B, const float* bias, - float* C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB) { - const int threads = framework::CPUContext::Context()->get_thread_num(); - int l2_size = - framework::CPUContext::Context()->get_l2_cache_size() / sizeof(float); - int l2_cache = l2_size > 0 ? l2_size : 512 * 1024; - - //! 
MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); - x_block /= NBLOCK; - x_block *= NBLOCK; - if (x_block != 0) { - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - } - x_block = x_block < NBLOCK ? NBLOCK : x_block; - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! load bpanel - float* b_pannel = - static_cast(framework::CPUContext::Context()->get_work_space( - K * (xmax - x0) * sizeof(float))); - if (!transB) { - loadb(b_pannel, B, N, 0, K, x0, xmax); - } -#pragma omp parallel for num_threads(threads) - for (unsigned int y = 0; y < M; y += MBLOCK_OTH) { - unsigned int ymax = y + MBLOCK_OTH; - if (ymax > M) { - ymax = M; - } - float* c_ptr0 = C + y * N + x0; - float* c_ptr1 = c_ptr0 + N; - float* c_ptr2 = c_ptr1 + N; - float* c_ptr3 = c_ptr2 + N; - float* c_ptr4 = c_ptr3 + N; - float* c_ptr5 = c_ptr4 + N; - - float* pout0 = c_ptr0; - float* pout1 = c_ptr1; - float* pout2 = c_ptr2; - float* pout3 = c_ptr3; - float* pout4 = c_ptr4; - float* pout5 = c_ptr5; - - float bias_local[6] = {0}; - if (is_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - bias_local[4] = bias[y + 4]; - bias_local[5] = bias[y + 5]; - } - - float cout0[NBLOCK]; - float cout1[NBLOCK]; - float cout2[NBLOCK]; - float cout3[NBLOCK]; - float cout4[NBLOCK]; - float cout5[NBLOCK]; - - const float* 
a_ptr_l = A_packed + y * K; - const float* b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 5) >= ymax) { - switch ((y + 5) - ymax) { - case 4: - c_ptr1 = cout1; - case 3: - c_ptr2 = cout2; - case 2: - c_ptr3 = cout3; - case 1: - c_ptr4 = cout4; - case 0: - c_ptr5 = cout5; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - pout4 = c_ptr4; - pout5 = c_ptr5; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - c_ptr4 = cout4; - c_ptr5 = cout5; - } - const float* a_ptr = a_ptr_l; - int tails = tail_pre; - int k = k_pre; - asm volatile( - // sgemm 6x8 - "vld1.32 {d2-d4}, [%[bias_ptr]] @ load bias 6 elements\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "pld [%[a_ptr]] @ preload a\n" - "vdup.i32 q12,d4[0] @ out40=0\n" - "pld [%[b_ptr]] @ preload b\n" - "vdup.i32 q13,d4[0] @ out41=0\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vdup.i32 q14,d4[1] @ out50=0\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vdup.i32 q15,d4[1] @ out51=0\n" - "pld [%[a_ptr], #128] @ preload a\n" - "vdup.i32 q4, d2[0] @ out00=0\n" - "pld [%[b_ptr], #128] @ preload b\n" - "vdup.i32 q5, d2[0] @ out01=0\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vdup.i32 q6, d2[1] @ out10=0\n" - "pld [%[a_ptr], #192] @ preload a\n" - "vdup.i32 q7, d2[1] @ out11=0\n" - "pld [%[b_ptr], #192] @ preload a\n" - "vdup.i32 q8, d3[0] @ out20=0\n" - "pld [%[a_ptr], #256] @ preload a\n" - "vdup.i32 q9, d3[0] @ out21=0\n" - "pld [%[b_ptr], #256] @ preload a\n" - "vdup.i32 q10,d3[1] @ out30=0\n" - "pld [%[b_ptr], #320] @ preload b\n" - "vdup.i32 q11,d3[1] @ out31=0\n" - "pld [%[b_ptr], #384] @ preload b\n" - "cmp %[k], #0 @ check weather k is " - "bigger than 0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" - /* Unroll 0*/ - "vld1.32 {d2-d3}, [%[a_ptr] :64]! 
@ load a4, a5, and next " - "a0, a1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 1 */ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - /*"pld [%[a_ptr], #64] @ preload a\n"*/ - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - /*"pld [%[b_ptr], #192]\n"*/ - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4, a5, a0, a1\n" - /* Unroll 2 */ - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
@ load b2\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - /*"pld [%[a_ptr], #240] @ preload\n"*/ - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - /*"pld [%[b_ptr], #208]\n"*/ - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3 */ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" - /* Unroll 0*/ - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! 
@ load a4,5, a0, a1\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1*/ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4,a5, a0,a1\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
@ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3*/ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "b 2f\n" - /* tails==1 final tail*/ - "3: @ tail=1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2}, [%[a_ptr] :64]! @ load a4,a5\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
@ load b2\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d0}, [%[a_ptr] :64]! 
@ load a4,a5\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q4, q4, q0 @ for relu\n" - "vmax.f32 q5, q5, q0 @ for relu\n" - "vmax.f32 q6, q6, q0 @ for relu\n" - "vmax.f32 q7, q7, q0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d20-d23}, [%[c_ptr3]]! @ store r3\n" - "vst1.32 {d24-d27}, [%[c_ptr4]]! @ store r4\n" - "vst1.32 {d28-d31}, [%[c_ptr5]]! 
@ store r5\n" - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), [k] "+r"(k), [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), [relu] "r"(is_relu) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", "cc", "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - } - } - } - } - } -} - -void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias, - float* C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB) { - const int threads = framework::CPUContext::Context()->get_thread_num(); - int l2_size = - framework::CPUContext::Context()->get_l2_cache_size() / sizeof(float); - int l2_cache = l2_size > 0 ? l2_size : 512 * 1024; - - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel - float* b_pannel = - static_cast(framework::CPUContext::Context()->get_work_space( - K * (xmax - x0) * sizeof(float))); - - if (!transB) { - loadb(b_pannel, B, N, 0, K, x0, xmax); - } -#pragma omp parallel for num_threads(threads) - for (unsigned int y = 0; y < M; y += MBLOCK_A73) { - unsigned int ymax = y + MBLOCK_A73; - if (ymax > M) { - ymax = M; - } - - float cout0[NBLOCK]; - float cout1[NBLOCK]; - float cout2[NBLOCK]; - float cout3[NBLOCK]; - - float bias_local[4] = {0}; - if (is_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - } - - float* c_ptr0 = C + y * N + x0; - float* c_ptr1 = c_ptr0 + N; - float* c_ptr2 = c_ptr1 + N; - float* c_ptr3 = c_ptr2 + N; - - float* pout0 = c_ptr0; - float* pout1 = c_ptr1; - float* pout2 = c_ptr2; - float* pout3 = c_ptr3; - - const float* a_ptr_l = A_packed + y * K; - const float* b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - case 2: - c_ptr1 = cout1; - case 1: - c_ptr2 = cout1; - case 0: - c_ptr3 = cout1; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - } - const float* a_ptr = a_ptr_l; - int tails = tail_pre; - int k = k_pre; - asm volatile( - "vld1.32 {d4-d5}, [%[bias_ptr]] @ load bias\n" - "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load a0~a3\n" - "vdup.32 q8, d4[0] @ add bias to out00\n" - "pld [%[a_ptr]] @ preload a, 64byte\n" - "vdup.32 q9, d4[0] @ add bias to out01\n" - "pld [%[b_ptr]] @ preload b\n" - "vdup.32 q10, d4[1] @ add bias to out10\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vdup.32 q11, d4[1] @ add bias to out11\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! 
@ load b1\n" - "vdup.32 q12, d5[0] @ add bias to out20\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vdup.32 q13, d5[0] @ add bias to out21\n" - "pld [%[a_ptr], #128] @ preload a\n" - "vdup.32 q14, d5[1] @ add bias to out30\n" - "pld [%[b_ptr], #128] @ preload b\n" - "vdup.32 q15, d5[1] @ add bias to out31\n" - "pld [%[b_ptr], #192] @ preload b\n" - "cmp %[k], #0 @ check weather k is " - "bigger than 0\n" - "beq 0f @ jump to tail\n" - - "1: @ main loop for k\n" - /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 1 */ - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1,b2\n" - /* Unroll 2 */ - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vld1.32 {d0-d3}, [%[a_ptr] :128]! 
@ load next a0~a3\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "bne 1b @ jump to main loop\n" - - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" - /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" // b1*a1 - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1 */ - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" // b6*a2 - "vld1.32 {d4-d7}, [%[a_ptr] :128]! 
@ load next 2xa0~a3\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1,b2\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" // b11 - // * - // a3 - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" // b16 - // * - // a4 - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "b 2f\n" - /* tails==1 final tail */ - "3: @ tail=1\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - /*aptr - 16 */ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * 
a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out7 += b2 * a3\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /*aptr - 16*/ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d28-d31}, [%[c_ptr3]]! 
@ store r3\n" - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), [k] "+r"(k), [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), [relu] "r"(is_relu) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", "cc", "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - } - } - } - } - } -} - -#endif //__aarch64__ -/// a: m*k b: k*n c: m*n -void sgemm_prepack(const float *A_packed, const float *B, const float *bias, - float *C, int M, int N, int K, bool is_bias, bool is_relu, - bool is_transB, ARMArch arch) { -#ifdef __aarch64__ - sgemm_conv_8x12(A_packed, B, bias, C, M, N, K, is_bias, is_relu, is_transB); -#else // armv7 - if (arch == A73) { - sgemm_conv_4x8(A_packed, B, bias, C, M, N, K, is_bias, is_relu, is_transB); - } else { - sgemm_conv_6x8(A_packed, B, bias, C, M, N, K, is_bias, is_relu, is_transB); - } -#endif // arm64 -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // CONV_OP -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/math/gemm/gemm1x1s1.h b/mobile/src/operators/math/gemm/gemm1x1s1.h deleted file mode 100644 index 19dcdccdb9..0000000000 --- a/mobile/src/operators/math/gemm/gemm1x1s1.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef CONV_OP - -#pragma once -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -#ifdef __aarch64__ -const int MBLOCK = 8; -const int NBLOCK = 12; -const int KBLOCK = 4; -inline int get_hblock(ARMArch arch) { return MBLOCK; } -#else -const int MBLOCK_A73 = 4; -const int MBLOCK_OTH = 6; -const int NBLOCK = 8; -const int KBLOCK = 4; - -inline int get_hblock(ARMArch arch) { - if (arch == A73) { - return MBLOCK_A73; - } else { - return MBLOCK_OTH; - } -} -#endif // __aarch64__ - -void gemm1x1s1_transform_weight(const framework::Tensor& weight, - const framework::Tensor& output, - framework::Tensor* trans_weight, - const int group, ARMArch arch); - -void sgemm_prepack(const float* A_packed, const float* B, const float* bias, - float* C, int M, int N, int K, bool is_bias, bool is_relu, - bool is_transB, ARMArch arch); - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // CONV_OP diff --git a/mobile/src/operators/math/gemm/gemm_kernel.h b/mobile/src/operators/math/gemm/gemm_kernel.h deleted file mode 100644 index 0f3089b204..0000000000 --- a/mobile/src/operators/math/gemm/gemm_kernel.h +++ /dev/null @@ -1,792 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#include -#include -#include "operators/math/math.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -#if __aarch64__ -void sgemm_6x16(const float *lhs, const float *rhs, const int k, float *output, - const int ldc) { - int kc1 = k; - int step = 4 * ldc; - int step1 = 4 * 6; - asm volatile( - "dup v6.4s, wzr \n\t" - "dup v7.4s, wzr \n\t" - "dup v8.4s, wzr \n\t" - "dup v9.4s, wzr \n\t" - "dup v10.4s, wzr \n\t" - "dup v11.4s, wzr \n\t" - "dup v12.4s, wzr \n\t" - "dup v13.4s, wzr \n\t" - - "dup v14.4s, wzr \n\t" - "dup v15.4s, wzr \n\t" - "dup v16.4s, wzr \n\t" - "dup v17.4s, wzr \n\t" - "dup v18.4s, wzr \n\t" - "dup v19.4s, wzr \n\t" - "dup v20.4s, wzr \n\t" - "dup v21.4s, wzr \n\t" - - "dup v22.4s, wzr \n\t" - "dup v23.4s, wzr \n\t" - "dup v24.4s, wzr \n\t" - "dup v25.4s, wzr \n\t" - "dup v26.4s, wzr \n\t" - "dup v27.4s, wzr \n\t" - "dup v28.4s, wzr \n\t" - "dup v29.4s, wzr \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt 2f \n\t" - "1: \n\t" - - "prfm pldl1keep, [%[lhs], #32] \n\t" - "prfm pldl1keep, [%[rhs], #64] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[lhs]], %[step1] \n\t" - "ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[rhs]], #64 \n\t" - - "fmla v6.4s, v2.4s, v0.s[0] \n\t" - "fmla v7.4s, v3.4s, v0.s[0] \n\t" - "fmla v8.4s, v4.4s, v0.s[0] \n\t" - "fmla v9.4s, v5.4s, v0.s[0] \n\t" - - "fmla v10.4s, v2.4s, v0.s[1] \n\t" - "fmla v11.4s, v3.4s, v0.s[1] \n\t" - "fmla v12.4s, v4.4s, v0.s[1] \n\t" - "fmla v13.4s, v5.4s, v0.s[1] \n\t" - - "fmla v14.4s, v2.4s, v0.s[2] \n\t" - "fmla v15.4s, v3.4s, v0.s[2] \n\t" - "fmla v16.4s, v4.4s, v0.s[2] \n\t" - "fmla v17.4s, v5.4s, v0.s[2] \n\t" - - "fmla v18.4s, v2.4s, v0.s[3] \n\t" - "fmla v19.4s, v3.4s, v0.s[3] \n\t" - "fmla v20.4s, v4.4s, v0.s[3] \n\t" - "fmla v21.4s, v5.4s, v0.s[3] \n\t" - - "fmla v22.4s, v2.4s, v1.s[0] \n\t" - "fmla v23.4s, v3.4s, v1.s[0] \n\t" - "fmla v24.4s, v4.4s, v1.s[0] \n\t" - "fmla v25.4s, v5.4s, v1.s[0] \n\t" - - "fmla 
v26.4s, v2.4s, v1.s[1] \n\t" - "fmla v27.4s, v3.4s, v1.s[1] \n\t" - "fmla v28.4s, v4.4s, v1.s[1] \n\t" - "fmla v29.4s, v5.4s, v1.s[1] \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge 1b \n\t" - "2: \n\t" - - "st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t" - "st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" - "st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step] \n\t" - "st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step] \n\t" - "st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" - "st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step] \n\t" - : [lhs] "+r"(lhs), [rhs] "+r"(rhs), [c] "+r"(output), [kc1] "+r"(kc1) - : [step] "r"(step), [step1] "r"(step1) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", - "v29"); -} -#else -void sgemm_6x8(const float *lhs, const float *rhs, const int k, float *output, - const int ldc) { - int kc1 = k >> 3; // k / 8 - int kc2 = k & 0x7; // k % 8 - int step = sizeof(float) * ldc; - asm volatile( - "pld [%[lhs]] \n\t" - "pld [%[lhs], #64] \n\t" - "pld [%[rhs]] \n\t" - "pld [%[rhs], #64] \n\t" - - "vmov.f32 q4, #0.0 \n\t" - "vmov.f32 q5, #0.0 \n\t" - "vmov.f32 q6, #0.0 \n\t" - "vmov.f32 q7, #0.0 \n\t" - "vmov.f32 q8, #0.0 \n\t" - "vmov.f32 q9, #0.0 \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - "vmov.f32 q14, #0.0 \n\t" - "vmov.f32 q15, #0.0 \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt 2f \n\t" - "1: \n\t" - - "pld [%[lhs], #128] \n\t" - "pld [%[rhs], #128] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! 
\n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "pld [%[lhs], #128] \n\t" - "pld [%[rhs], #128] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "pld [%[lhs], #128] \n\t" - "pld [%[rhs], #128] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! 
\n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "pld [%[lhs], #128] \n\t" - "pld [%[rhs], #128] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! 
\n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge 1b \n\t" - "2: \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "blt 4f \n\t" - "3: \n\t" - - "vld1.32 {d0-d2}, [%[lhs]]! \n\t" - "vld1.32 {q2, q3}, [%[rhs]]! \n\t" - - "vmla.f32 q4, q2, d0[0] \n\t" - "vmla.f32 q5, q3, d0[0] \n\t" - "vmla.f32 q6, q2, d0[1] \n\t" - "vmla.f32 q7, q3, d0[1] \n\t" - "vmla.f32 q8, q2, d1[0] \n\t" - "vmla.f32 q9, q3, d1[0] \n\t" - "vmla.f32 q10, q2, d1[1] \n\t" - "vmla.f32 q11, q3, d1[1] \n\t" - "vmla.f32 q12, q2, d2[0] \n\t" - "vmla.f32 q13, q3, d2[0] \n\t" - "vmla.f32 q14, q2, d2[1] \n\t" - "vmla.f32 q15, q3, d2[1] \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "bge 3b \n\t" - "4: \n\t" - - "mov r5, %[c] \n\t" - "mov r6, %[step] \n\t" - "vst1.32 {q4, q5}, [r5], r6 \n\t" - "vst1.32 {q6, q7}, [r5], r6 \n\t" - "vst1.32 {q8, q9}, [r5], r6 \n\t" - "vst1.32 {q10, q11}, [r5], r6 \n\t" - "vst1.32 {q12, q13}, [r5], r6 \n\t" - "vst1.32 {q14, q15}, [r5] \n\t" - : - : [lhs] "r"(lhs), [rhs] "r"(rhs), [c] "r"(output), [kc1] "r"(kc1), - [kc2] "r"(kc2), [step] "r"(step) - : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} -#endif // __aarch64__ - -void sgemv_notrans_mx1(const int M, const int N, const float alpha, - const float *A, const int lda, const float *B, - const float beta, float *C) { - uint32_t mask[4] = {0, 1, 2, 3}; - int remain_n = N & 0x3; - uint32x4_t vmask = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_n)); - float32x4_t _valpha = vdupq_n_f32(alpha); - - #pragma omp parallel for - for (int m = 0; m < M - 3; m += 4) { - const 
float *in0 = A + m * lda; - const float *in1 = in0 + lda; - const float *in2 = in1 + lda; - const float *in3 = in2 + lda; - float *output = C + m; - - float32x4_t _sum0, _sum1, _sum2, _sum3; - _sum0 = vdupq_n_f32(0.f); - _sum1 = vdupq_n_f32(0.f); - _sum2 = vdupq_n_f32(0.f); - _sum3 = vdupq_n_f32(0.f); - int n = 0; - for (; n < N - 3; n += 4) { - float32x4_t _r0 = vld1q_f32(in0 + n); - float32x4_t _r1 = vld1q_f32(in1 + n); - float32x4_t _r2 = vld1q_f32(in2 + n); - float32x4_t _r3 = vld1q_f32(in3 + n); - float32x4_t _b = vld1q_f32(B + n); - _sum0 = vmlaq_f32(_sum0, _r0, _b); - _sum1 = vmlaq_f32(_sum1, _r1, _b); - _sum2 = vmlaq_f32(_sum2, _r2, _b); - _sum3 = vmlaq_f32(_sum3, _r3, _b); - } - if (n < N) { - float32x4_t _r0 = vld1q_f32(in0 + n); - float32x4_t _r1 = vld1q_f32(in1 + n); - float32x4_t _r2 = vld1q_f32(in2 + n); - float32x4_t _r3 = vld1q_f32(in3 + n); - float32x4_t _b = vld1q_f32(B + n); - _r0 = vandq_f32_u32(_r0, vmask); - _r1 = vandq_f32_u32(_r1, vmask); - _r2 = vandq_f32_u32(_r2, vmask); - _r3 = vandq_f32_u32(_r3, vmask); - _b = vandq_f32_u32(_b, vmask); - _sum0 = vmlaq_f32(_sum0, _r0, _b); - _sum1 = vmlaq_f32(_sum1, _r1, _b); - _sum2 = vmlaq_f32(_sum2, _r2, _b); - _sum3 = vmlaq_f32(_sum3, _r3, _b); - } - _sum0 = vpaddq_f32(_sum0, _sum1); - _sum2 = vpaddq_f32(_sum2, _sum3); - _sum0 = vpaddq_f32(_sum0, _sum2); - _sum0 = vmulq_f32(_sum0, _valpha); - if (beta != 0.f) { - _sum2 = vmulq_n_f32(vld1q_f32(output), beta); - _sum0 = vaddq_f32(_sum0, _sum2); - } - // restore - vst1q_f32(output, _sum0); - } - // remain m - for (int m = (M & 0xfffffffc); m < M; ++m) { - const float *in0 = A + m * lda; - float *output = C + m; - float32x4_t _sum0 = vdupq_n_f32(0.f); - - int n = 0; - for (; n < N - 3; n += 4) { - float32x4_t _r0 = vld1q_f32(in0 + n); - float32x4_t _b = vld1q_f32(B + n); - _sum0 = vmlaq_f32(_sum0, _r0, _b); - } - if (n < N) { - float32x4_t _r0 = vld1q_f32(in0 + n); - float32x4_t _b = vld1q_f32(B + n); - _r0 = vandq_f32_u32(_r0, vmask); - _b = 
vandq_f32_u32(_b, vmask); - _sum0 = vmlaq_f32(_sum0, _r0, _b); - } - float32x2_t _ss = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0)); - float32x2_t _sss2 = vpadd_f32(_ss, _ss); - *output = - vget_lane_f32(_sss2, 0) * vgetq_lane_f32(_valpha, 0) + beta * (*output); - } -} - -void sgemv_notrans_mx1_faster(const int M, const int N, const float alpha, - const float *A, const int lda, const float *B, - const float beta, float *C) { -#pragma omp parallel for - for (int m = 0; m < M - 3; m += 4) { - const float *a_ptr0 = A + m * lda; - const float *a_ptr1 = a_ptr0 + lda; - const float *a_ptr2 = a_ptr1 + lda; - const float *a_ptr3 = a_ptr2 + lda; - const float *b_ptr = B; - float *c_ptr = C + m; - float sum0 = 0.f; - float sum1 = 0.f; - float sum2 = 0.f; - float sum3 = 0.f; - int n = 0; - -#if __ARM_NEON - /* matrix_mul_float: - * Calculate matrix A(4xN) * matrix B(Nx1) and store to a result array - * sum_arr[4], a 4x8 * 8x1 will be calculated on each iteration. - * - * Variable: a_ptr0 = pointer to the first row of matrix A, row major order - * Variable: a_ptr1 = pointer to the second row of matrix A, row major order - * Variable: a_ptr2 = pointer to the third row of matrix A, row major order - * Variable: a_ptr3 = pointer to the fourth row of matrix A, row major order - * Variable: b_ptr = pointer to the first col of matrix B, col major order - * Variable: s_ptr = pointer to the sum result array - * Variable: loop = the numbers of loops - * - * Register: Q(V)4-Q(V)11 = matrix A - * Register: Q(V)0-Q(V)1 = matrix B - * Register: Q(V)12-Q(V)15 = matrix C - */ - - float sum_arr[4] = {0.f}; - float *s_ptr = sum_arr; - int loop = N / 8; - -#if __aarch64__ - - if (loop > 0) { - asm volatile( - // set v12-v15 to 0 - "movi v12.4s, #0 \n" - "movi v13.4s, #0 \n" - "movi v14.4s, #0 \n" - "movi v15.4s, #0 \n" - - "0: \n" - // load A and B - "ld1 {v0.4s, v1.4s}, [%[b_ptr]] , #32 \n" - "ld1 {v4.4s, v5.4s}, [%[a_ptr0]], #32 \n" - "ld1 {v6.4s, v7.4s}, [%[a_ptr1]], #32 \n" - "ld1 
{v8.4s, v9.4s}, [%[a_ptr2]], #32 \n" - "ld1 {v10.4s, v11.4s}, [%[a_ptr3]], #32 \n" - - "fmla v12.4s, v4.4s, v0.4s \n" // s0=A(r0c0-r0c3)*B(r0-r3) - "fmla v13.4s, v6.4s, v0.4s \n" // s1=A(r1c0-r1c3)*B(r0-r3) - "fmla v14.4s, v8.4s, v0.4s \n" // s2=A(r2c0-r2c3)*B(r0-r3) - "fmla v15.4s, v10.4s, v0.4s \n" // s3=A(r3c0-r3c3)*B(r0-r3) - - "fmla v12.4s, v5.4s, v1.4s \n" // s0=A(r0c4-r0c7)*B(r4-r7) - "fmla v13.4s, v7.4s, v1.4s \n" // s1=A(r1c4-r1c7)*B(r4-r7) - "fmla v14.4s, v9.4s, v1.4s \n" // s2=A(r2c4-r2c7)*B(r4-r7) - "fmla v15.4s, v11.4s, v1.4s \n" // s3=A(r3c4-r3c7)*B(r4-r7) - - // cycle - "subs %[loop], %[loop], #1 \n" - "bne 0b \n" - - // add and store - "faddp v4.4s, v12.4s, v13.4s \n" - "faddp v5.4s, v14.4s, v15.4s \n" - "faddp v6.4s, v4.4s, v5.4s \n" - "st1 {v6.4s}, [%[s_ptr]] \n" - - : [loop] "+r"(loop), [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), - [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3), [b_ptr] "+r"(b_ptr) - : [s_ptr] "r"(s_ptr) - : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "cc", "memory"); - } -#else // __aarch64__ - - if (loop > 0) { - asm volatile( - - // set Q12-Q15 to 0 - "vmov.i32 q12, #0 \n" - "vmov.i32 q13, #0 \n" - "vmov.i32 q14, #0 \n" - "vmov.i32 q15, #0 \n" - - "0: \n" - // load A and B - "vld1.f32 {d0-d3}, [%[b_ptr]]! \n" - "vld1.f32 {d8-d11}, [%[a_ptr0]]! \n" - "vld1.f32 {d12-d15}, [%[a_ptr1]]! \n" - "vld1.f32 {d16-d19}, [%[a_ptr2]]! \n" - "vld1.f32 {d20-d23}, [%[a_ptr3]]! 
\n" - - "vmla.f32 q12, q4, q0 \n" // s0=A(r0c0-r0c3)*B(r0-r3) - "vmla.f32 q13, q6, q0 \n" // s1=A(r1c0-r1c3)*B(r0-r3) - "vmla.f32 q14, q8, q0 \n" // s2=A(r2c0-r2c3)*B(r0-r3) - "vmla.f32 q15, q10, q0 \n" // s3=A(r3c0-r3c3)*B(r0-r3) - - "vmla.f32 q12, q5, q1 \n" // s0=A(r0c4-r0c7)*B(r4-r7) - "vmla.f32 q13, q7, q1 \n" // s1=A(r1c4-r1c7)*B(r4-r7) - "vmla.f32 q14, q9, q1 \n" // s2=A(r2c4-r2c7)*B(r4-r7) - "vmla.f32 q15, q11, q1 \n" // s3=A(r3c4-r3c7)*B(r4-r7) - - // cycle - "subs %[loop], #1 \n" - "bne 0b \n" - // add and store - "vpadd.f32 d8, d24, d25 \n" - "vpadd.f32 d9, d26, d27 \n" - "vpadd.f32 d10, d28, d29 \n" - "vpadd.f32 d11, d30, d31 \n" - - "vpadd.f32 d12, d8, d9 \n" - "vpadd.f32 d13, d10, d11 \n" - "vst1.32 {d12-d13}, [%[s_ptr]] \n" - - : [loop] "+r"(loop), [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), - [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3), [b_ptr] "+r"(b_ptr) - : [s_ptr] "r"(s_ptr) - : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", - "q13", "q14", "q15", "cc", "memory"); - } -#endif // __aarch64__ - sum0 += s_ptr[0]; - sum1 += s_ptr[1]; - sum2 += s_ptr[2]; - sum3 += s_ptr[3]; - n = N - (N & 0x07); -#endif // __ARM_NEON - - for (; n < N - 7; n += 8) { - sum0 += a_ptr0[0] * b_ptr[0]; - sum1 += a_ptr1[0] * b_ptr[0]; - sum2 += a_ptr2[0] * b_ptr[0]; - sum3 += a_ptr3[0] * b_ptr[0]; - - sum0 += a_ptr0[1] * b_ptr[1]; - sum1 += a_ptr1[1] * b_ptr[1]; - sum2 += a_ptr2[1] * b_ptr[1]; - sum3 += a_ptr3[1] * b_ptr[1]; - - sum0 += a_ptr0[2] * b_ptr[2]; - sum1 += a_ptr1[2] * b_ptr[2]; - sum2 += a_ptr2[2] * b_ptr[2]; - sum3 += a_ptr3[2] * b_ptr[2]; - - sum0 += a_ptr0[3] * b_ptr[3]; - sum1 += a_ptr1[3] * b_ptr[3]; - sum2 += a_ptr2[3] * b_ptr[3]; - sum3 += a_ptr3[3] * b_ptr[3]; - - sum0 += a_ptr0[4] * b_ptr[4]; - sum1 += a_ptr1[4] * b_ptr[4]; - sum2 += a_ptr2[4] * b_ptr[4]; - sum3 += a_ptr3[4] * b_ptr[4]; - - sum0 += a_ptr0[5] * b_ptr[5]; - sum1 += a_ptr1[5] * b_ptr[5]; - sum2 += a_ptr2[5] * b_ptr[5]; - sum3 += a_ptr3[5] * b_ptr[5]; - - sum0 += 
a_ptr0[6] * b_ptr[6]; - sum1 += a_ptr1[6] * b_ptr[6]; - sum2 += a_ptr2[6] * b_ptr[6]; - sum3 += a_ptr3[6] * b_ptr[6]; - - sum0 += a_ptr0[7] * b_ptr[7]; - sum1 += a_ptr1[7] * b_ptr[7]; - sum2 += a_ptr2[7] * b_ptr[7]; - sum3 += a_ptr3[7] * b_ptr[7]; - - a_ptr0 += 8; - a_ptr1 += 8; - a_ptr2 += 8; - a_ptr3 += 8; - b_ptr += 8; - } - - for (; n < N; ++n) { - sum0 += a_ptr0[0] * b_ptr[0]; - sum1 += a_ptr1[0] * b_ptr[0]; - sum2 += a_ptr2[0] * b_ptr[0]; - sum3 += a_ptr3[0] * b_ptr[0]; - - a_ptr0 += 1; - a_ptr1 += 1; - a_ptr2 += 1; - a_ptr3 += 1; - b_ptr += 1; - } - c_ptr[0] = alpha * sum0 + beta * c_ptr[0]; - c_ptr[1] = alpha * sum1 + beta * c_ptr[1]; - c_ptr[2] = alpha * sum2 + beta * c_ptr[2]; - c_ptr[3] = alpha * sum3 + beta * c_ptr[3]; - } - - int m_tail_start = M - (M & 0x03); - for (int m = m_tail_start; m < M; ++m) { - const float *a_ptr = A + m * lda; - const float *b_ptr = B; - float *c_ptr = C + m; - float sum = 0.f; - for (int n = 0; n < N; n++) { - sum += a_ptr[0] * b_ptr[0]; - a_ptr += 1; - b_ptr += 1; - } - c_ptr[0] = alpha * sum + beta * c_ptr[0]; - } -} - -void sgemv_trans_mx1(const int M, const int N, const float alpha, - const float *A, const int lda, const float *B, - const float beta, float *C) { -// create buff_c to store temp computation result for each threading -#ifdef _OPENMP - int threads_num = omp_get_max_threads(); -#else - int threads_num = 1; -#endif // _OPENMP - float *buf_c = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * threads_num * M)); - memset(buf_c, 0, threads_num * M * sizeof(float)); - - #pragma omp parallel for - for (int n = 0; n < N - 3; n += 4) { -#ifdef _OPENMP - const int tid = omp_get_thread_num(); -#else - const int tid = 0; -#endif // _OPENMP - float *thread_buf_c = buf_c + tid * M; - const float *in0 = A + n * lda; - const float *in1 = in0 + lda; - const float *in2 = in1 + lda; - const float *in3 = in2 + lda; - float32x4_t _b = vld1q_f32(B + n); - float32x4_t _sum0; - int m = 0; - for (; m < M - 3; m += 4) { - 
float32x4_t _r0 = vld1q_f32(in0 + m); - float32x4_t _r1 = vld1q_f32(in1 + m); - float32x4_t _r2 = vld1q_f32(in2 + m); - float32x4_t _r3 = vld1q_f32(in3 + m); - float32x4_t _vbuff_c = vld1q_f32(thread_buf_c + m); - - _sum0 = vmulq_lane_f32(_r0, vget_low_f32(_b), 0); - _sum0 = vmlaq_lane_f32(_sum0, _r1, vget_low_f32(_b), 1); - _sum0 = vmlaq_lane_f32(_sum0, _r2, vget_high_f32(_b), 0); - _sum0 = vmlaq_lane_f32(_sum0, _r3, vget_high_f32(_b), 1); - _sum0 = vaddq_f32(_sum0, _vbuff_c); - - vst1q_f32(thread_buf_c + m, _sum0); - } - if (m < M) { - float32x4_t _sum0 = vdupq_n_f32(0.0f); - float32x4_t _r0 = vld1q_f32(in0 + m); - float32x4_t _r1 = vld1q_f32(in1 + m); - float32x4_t _r2 = vld1q_f32(in2 + m); - float32x4_t _r3 = vld1q_f32(in3 + m); - float32x4_t _vbuff_c = vld1q_f32(thread_buf_c + m); - - _sum0 = vmulq_lane_f32(_r0, vget_low_f32(_b), 0); - _sum0 = vmlaq_lane_f32(_sum0, _r1, vget_low_f32(_b), 1); - _sum0 = vmlaq_lane_f32(_sum0, _r2, vget_high_f32(_b), 0); - _sum0 = vmlaq_lane_f32(_sum0, _r3, vget_high_f32(_b), 1); - _sum0 = vaddq_f32(_sum0, _vbuff_c); - switch (M - m) { - case 3: - vst1q_lane_f32(thread_buf_c + m + 2, _sum0, 2); - case 2: - vst1_f32(thread_buf_c + m, vget_low_f32(_sum0)); - break; - case 1: - vst1q_lane_f32(thread_buf_c + m, _sum0, 0); - break; - } - } - } - - // remain n - #pragma omp parallel for - for (int n = (N & 0xfffffffc); n < N; ++n) { -#ifdef _OPENMP - const int tid = omp_get_thread_num(); -#else - const int tid = 0; -#endif // _OPENMP - float *thread_buf_c = buf_c + tid * M; - const float *in0 = A + n * lda; - float32x4_t _b = vld1q_dup_f32(B + n); - float32x4_t _sum0; - int m = 0; - for (; m < M - 3; m += 4) { - float32x4_t _r0 = vld1q_f32(in0 + m); - float32x4_t _vbuff_c = vld1q_f32(thread_buf_c + m); - _sum0 = vmulq_f32(_r0, _b); - _sum0 = vaddq_f32(_sum0, _vbuff_c); - vst1q_f32(thread_buf_c + m, _sum0); - } - for (; m < M; ++m) { - thread_buf_c[m] += in0[m] * B[n]; - } - } - - // reduction operate for buf_c, sum to C and do left 
operations - // y := alpha * A' * X + beta * y - // reduction operate: sum multi-threadings result for over-all: A' * X - float32x4_t _valpha = vdupq_n_f32(alpha); - if (beta == 0.f) { - #pragma omp parallel for - for (int m = 0; m < M - 3; m += 4) { - float32x4_t _sum0 = vld1q_f32(buf_c + m); - for (int tid = 1; tid < threads_num; ++tid) { - _sum0 += vld1q_f32(buf_c + tid * M + m); - } - vst1q_f32(C + m, _sum0 * _valpha); - } - - for (int m = (M & 0xfffffffc); m < M; ++m) { - float _sum0 = *(buf_c + m); - for (int tid = 1; tid < threads_num; ++tid) { - _sum0 += *(buf_c + tid * M + m); - } - C[m] = _sum0 * alpha; - } - } else { // beta != 0.f - float32x4_t _vbeta = vdupq_n_f32(beta); - #pragma omp parallel for - for (int m = 0; m < M - 3; m += 4) { - float32x4_t _sum0 = vld1q_f32(buf_c + m); - for (int tid = 1; tid < threads_num; ++tid) { - _sum0 += vld1q_f32(buf_c + tid * M + m); - } - float32x4_t _vc = vld1q_f32(C + m); - vst1q_f32(C + m, _sum0 * _valpha + _vbeta * _vc); - } - - for (int m = (M & 0xfffffffc); m < M; ++m) { - float _sum0 = *(buf_c + m); - for (int tid = 1; tid < threads_num; ++tid) { - _sum0 += *(buf_c + tid * M + m); - } - C[m] = _sum0 * alpha + beta * C[m]; - } - } - - // free buff_c - paddle_mobile::memory::Free(buf_c); -} - -void sgemv_mx1(const bool trans, const int M, const int N, const float alpha, - const float *A, const int lda, const float *B, const float beta, - float *C) { - if (trans) { - sgemv_trans_mx1(M, N, alpha, A, lda, B, beta, C); - } else { - // sgemv_notrans_mx1(M, N, alpha, A, lda, B, beta, C); - sgemv_notrans_mx1_faster(M, N, alpha, A, lda, B, beta, C); - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/math/gemm/pack_kernel.h b/mobile/src/operators/math/gemm/pack_kernel.h deleted file mode 100644 index d3b1359610..0000000000 --- a/mobile/src/operators/math/gemm/pack_kernel.h +++ /dev/null @@ -1,801 +0,0 @@ -/* Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#include -#ifdef _OPENMP -#include -#endif -#include "operators/math/math.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -void pack_lhs_6r(const int m, const int k, const float *A, const int lda, - float *output, const bool unroll) { - uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5}; - int remain_k = k & 0x3; - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k)); - - #pragma omp parallel for if (unroll) - for (int i = 0; i < m - 5; i += 6) { - const float *a0 = A + i * lda; - const float *a1 = A + (i + 1) * lda; - const float *a2 = A + (i + 2) * lda; - const float *a3 = A + (i + 3) * lda; - const float *a4 = A + (i + 4) * lda; - const float *a5 = A + (i + 5) * lda; - float *out_ptr = output + i * k; - - int loops = k >> 2; - if (loops > 0) { -#if __aarch64__ - for (int l = 0; l < loops; ++l) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), 
vget_low_f32(_q1.val[1])); - _d2 = - vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - _d3 = - vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); - - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - vst1q_f32(out_ptr + 18, _d3); - vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1])); - - a0 += 4; - a1 += 4; - a2 += 4; - a3 += 4; - a4 += 4; - a5 += 4; - out_ptr += 24; - } -#else - asm volatile( - "loop_4k_%=: \n" - "vld1.32 {d0-d1}, [%[a0]]! \n" - "vld1.32 {d2-d3}, [%[a1]]! \n" - "vld1.32 {d4-d5}, [%[a2]]! \n" - "vld1.32 {d6-d7}, [%[a3]]! \n" - "vld1.32 {d8-d9}, [%[a4]]! \n" - "vld1.32 {d10-d11}, [%[a5]]! \n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vtrn.32 q4, q5 \n" - "vswp.32 d1, d4 \n" - "vswp.32 d3, d6 \n" - - "vst1.32 {q0}, [%[out]]! \n" - "vst1.32 {d8}, [%[out]]! \n" - "vst1.32 {q1}, [%[out]]! \n" - "vst1.32 {d10}, [%[out]]! \n" - "vst1.32 {q2}, [%[out]]! \n" - "vst1.32 {d9}, [%[out]]! \n" - "vst1.32 {q3}, [%[out]]! \n" - "vst1.32 {d11}, [%[out]]! 
\n" - - "subs %[loops], #1 \n" - "bne loop_4k_%= \n" - : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), - [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -#endif - } - - if (remain_k > 0) { - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - _d0 = vandq_f32_u32(_d0, vmask1); - _d1 = vandq_f32_u32(_d1, vmask1); - _d2 = vandq_f32_u32(_d2, vmask1); - _d3 = vandq_f32_u32(_d3, vmask1); - _d4 = vandq_f32_u32(_d4, vmask1); - _d5 = vandq_f32_u32(_d5, vmask1); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - - switch (remain_k) { - case 3: - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - case 2: - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - case 1: - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - default: - break; - } - } - } - - int remain_m = m % 6; - if (remain_m) { - int remain_m_start = m - remain_m; - const float *a0 = A + remain_m_start * lda; - const float *a1 = a0 + lda; - const float *a2 = a0 + 2 * lda; - const float *a3 = a0 + 3 * lda; - const float *a4 = a0 + 4 * lda; - const float *a5 = a0 + 5 * lda; - float *out_ptr = output + remain_m_start * k; - - uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m)); - uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_m)); - const float zerobuff[4] = {0.f, 0.f, 0.f, 0.f}; - - int lk = 0; - for (; lk < k - 3; lk += 4) { - 
switch (remain_m) { - case 1: - a1 = zerobuff; - case 2: - a2 = zerobuff; - case 3: - a3 = zerobuff; - case 4: - a4 = zerobuff; - case 5: - a5 = zerobuff; - default: - break; - } -#if __aarch64__ - float32x4_t _d0 = vld1q_f32(a0); - float32x4_t _d1 = vld1q_f32(a1); - float32x4_t _d2 = vld1q_f32(a2); - float32x4_t _d3 = vld1q_f32(a3); - float32x4_t _d4 = vld1q_f32(a4); - float32x4_t _d5 = vld1q_f32(a5); - - float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); - _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); - - _d0 = vandq_f32_u32(_d0, vmask2); - _d1 = vandq_f32_u32(_d1, vmask2); - _d2 = vandq_f32_u32(_d2, vmask2); - _d3 = vandq_f32_u32(_d3, vmask2); - _d4 = vandq_f32_u32(_q3.val[0], vmask3); - _d5 = vandq_f32_u32(_q3.val[1], vmask3); - - vst1q_f32(out_ptr, _d0); - vst1_f32(out_ptr + 4, vget_low_f32(_d4)); - vst1q_f32(out_ptr + 6, _d1); - vst1_f32(out_ptr + 10, vget_low_f32(_d5)); - vst1q_f32(out_ptr + 12, _d2); - vst1_f32(out_ptr + 16, vget_high_f32(_d4)); - vst1q_f32(out_ptr + 18, _d3); - vst1_f32(out_ptr + 22, vget_high_f32(_d5)); - - a0 += 4; - a1 += 4; - a2 += 4; - a3 += 4; - a4 += 4; - a5 += 4; - out_ptr += 24; -#else - asm volatile( - "vld1.32 {d0-d1}, [%[a0]]! \n" - "vld1.32 {d2-d3}, [%[a1]]! \n" - "vld1.32 {d4-d5}, [%[a2]]! \n" - "vld1.32 {d6-d7}, [%[a3]]! \n" - "vld1.32 {d8-d9}, [%[a4]]! \n" - "vld1.32 {d10-d11}, [%[a5]]! 
\n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vtrn.32 q4, q5 \n" - "vswp.32 d1, d4 \n" - "vswp.32 d3, d6 \n" - - "vbif q0, %q[vzero], %q[vmask2] \n" - "vbif q1, %q[vzero], %q[vmask2] \n" - "vbif q2, %q[vzero], %q[vmask2] \n" - "vbif q3, %q[vzero], %q[vmask2] \n" - "vbif q4, %q[vzero], %q[vmask3] \n" - "vbif q5, %q[vzero], %q[vmask3] \n" - - "vst1.32 {q0}, [%[out]]! \n" - "vst1.32 {d8}, [%[out]]! \n" - "vst1.32 {q1}, [%[out]]! \n" - "vst1.32 {d10}, [%[out]]! \n" - "vst1.32 {q2}, [%[out]]! \n" - "vst1.32 {d9}, [%[out]]! \n" - "vst1.32 {q3}, [%[out]]! \n" - "vst1.32 {d11}, [%[out]]! \n" - : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), - [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5) - : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -#endif - } - // remain k - switch (remain_m) { - case 1: - a1 = zerobuff; - case 2: - a2 = zerobuff; - case 3: - a3 = zerobuff; - case 4: - a4 = zerobuff; - case 5: - a5 = zerobuff; - default: - break; - } - for (; lk < k; ++lk) { - *out_ptr++ = *a0++; - *out_ptr++ = *a1++; - *out_ptr++ = *a2++; - *out_ptr++ = *a3++; - *out_ptr++ = *a4++; - *out_ptr++ = *a5++; - } - } -} - -#if __aarch64__ -void pack_rhs_16c(int k, int n, const float *B, int ldb, float *output, - const bool unroll) { - uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - uint32_t remain_n = n & 0x7; - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_n)); - uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_n)); - - #pragma omp parallel for if (unroll) - for (int i = 0; i < k - 3; i += 4) { - const float *b0 = B + i * ldb; - const float *b1 = b0 + ldb; - const float *b2 = b1 + ldb; - const float *b3 = b2 + ldb; - int j = 0; - asm volatile( - "prfm pldl1keep, [%[b0]] \n" - "prfm pldl1keep, [%[b1]] \n" - "prfm pldl1keep, [%[b2]] \n" - "prfm pldl1keep, [%[b3]] \n" - : - : [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), 
[b3] "r"(b3)); - - for (; j < n - 15; j += 16) { - float *out_ptr0 = output + j * k + 16 * i; - asm volatile( - "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]], #64 \n" - "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[b1]], #64 \n" - "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[out_ptr0]], #64 \n" - "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[out_ptr0]], #64 \n" - - "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b2]], #64 \n" - "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[b3]], #64 \n" - "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[out_ptr0]], #64 \n" - "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[out_ptr0]], #64 \n" - : [out_ptr0] "+r"(out_ptr0), [b0] "+r"(b0), [b1] "+r"(b1), - [b2] "+r"(b2), [b3] "+r"(b3) - : - : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - } - for (; j < n - 7; j += 8) { - float *out_ptr0 = output + (j & 0xFFFFFFF0) * k + 16 * i + (j & 0xF); - int step = 64; - asm volatile( - "ld1 {v0.4s, v1.4s}, [%[b0]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[b1]], #32 \n" - "ld1 {v4.4s, v5.4s}, [%[b2]], #32 \n" - "ld1 {v6.4s, v7.4s}, [%[b3]], #32 \n" - - "st1 {v0.4s, v1.4s}, [%[out_ptr0]], %[step] \n" - "st1 {v2.4s, v3.4s}, [%[out_ptr0]], %[step] \n" - "st1 {v4.4s, v5.4s}, [%[out_ptr0]], %[step] \n" - "st1 {v6.4s, v7.4s}, [%[out_ptr0]], %[step] \n" - : [out_ptr0] "+r"(out_ptr0), [b0] "+r"(b0), [b1] "+r"(b1), - [b2] "+r"(b2), [b3] "+r"(b3) - : [step] "r"(step) - : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - } - if (j < n) { - float *out_ptr0 = output + (j & 0xFFFFFFF0) * k + 16 * i + (j & 0xF); - int step = 64; - asm volatile( - "ld1 {v0.4s, v1.4s}, [%[b0]] \n" - "ld1 {v2.4s, v3.4s}, [%[b1]] \n" - "ld1 {v4.4s, v5.4s}, [%[b2]] \n" - "ld1 {v6.4s, v7.4s}, [%[b3]] \n" - - "and v0.16b, v0.16b, %[vmask1].16b \n" - "and v1.16b, v1.16b, %[vmask2].16b \n" - "and v2.16b, v2.16b, %[vmask1].16b \n" - "and v3.16b, v3.16b, %[vmask2].16b \n" - "and v4.16b, v4.16b, %[vmask1].16b \n" - "and v5.16b, v5.16b, %[vmask2].16b \n" - "and v6.16b, v6.16b, %[vmask1].16b \n" - "and v7.16b, v7.16b, %[vmask2].16b \n" - 
- "st1 {v0.4s, v1.4s}, [%[out_ptr0]], %[step] \n" - "st1 {v2.4s, v3.4s}, [%[out_ptr0]], %[step] \n" - "st1 {v4.4s, v5.4s}, [%[out_ptr0]], %[step] \n" - "st1 {v6.4s, v7.4s}, [%[out_ptr0]], %[step] \n" - : [out_ptr0] "+r"(out_ptr0) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [b0] "r"(b0), - [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3), [step] "r"(step) - : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - j += 8; - } - - if (j & 0xf) { - float *out_ptr0 = output + (j & 0xFFFFFFF0) * k + 16 * i + (j & 0xF); - vst1q_f32(out_ptr0, vzero); - vst1q_f32(out_ptr0 + 4, vzero); - out_ptr0 += 16; - vst1q_f32(out_ptr0, vzero); - vst1q_f32(out_ptr0 + 4, vzero); - out_ptr0 += 16; - vst1q_f32(out_ptr0, vzero); - vst1q_f32(out_ptr0 + 4, vzero); - out_ptr0 += 16; - vst1q_f32(out_ptr0, vzero); - vst1q_f32(out_ptr0 + 4, vzero); - } - } - // remain k - for (int i = (k & 0xFFFFFFFC); i < k; ++i) { - const float *b0 = B + i * ldb; - int j = 0; - asm volatile("prfm pldl1keep, [%[b0]] \n" - : - : [b0] "r"(b0)); - - for (; j < n - 15; j += 16) { - float *out_ptr0 = output + j * k + 16 * i; - asm volatile( - "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]], #64 \n" - "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[out_ptr0]], #64 \n" - : [out_ptr0] "+r"(out_ptr0), [b0] "+r"(b0) - : - : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - } - for (; j < n - 7; j += 8) { - float *out_ptr0 = output + (j & 0xFFFFFFF0) * k + 16 * i + (j & 0xF); - int step = 64; - asm volatile( - "ld1 {v0.4s, v1.4s}, [%[b0]], #32 \n" - "st1 {v0.4s, v1.4s}, [%[out_ptr0]], %[step] \n" - : [out_ptr0] "+r"(out_ptr0), [b0] "+r"(b0) - : [step] "r"(step) - : "memory", "v0", "v1"); - } - if (j < n) { - float *out_ptr0 = output + (j & 0xFFFFFFF0) * k + 16 * i + (j & 0xF); - asm volatile( - "ld1 {v0.4s, v1.4s}, [%[b0]] \n" - "and v0.16b, v0.16b, %[vmask1].16b \n" - "and v1.16b, v1.16b, %[vmask2].16b \n" - "st1 {v0.4s, v1.4s}, [%[out_ptr0]] \n" - : [out_ptr0] "+r"(out_ptr0) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), 
[b0] "r"(b0) - : "memory", "v0", "v1"); - j += 8; - } - if (j & 0xf) { - float *out_ptr0 = output + (j & 0xFFFFFFF0) * k + 16 * i + (j & 0xF); - vst1q_f32(out_ptr0, vzero); - vst1q_f32(out_ptr0 + 4, vzero); - } - } -} -#else - -void pack_rhs_8c(int k, int n, const float *B, int ldb, float *output, - const bool unroll) { - uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - uint32_t remain_n = n & 0x7; - uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_n)); - uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_n)); - - #pragma omp parallel for if (unroll) - for (int i = 0; i < k - 3; i += 4) { - const float *b0 = B + i * ldb; - const float *b1 = b0 + ldb; - const float *b2 = b1 + ldb; - const float *b3 = b2 + ldb; - int j = 0; - for (; j < n - 15; j += 16) { - float *out_ptr0 = output + j * k + 8 * i; - float *out_ptr1 = out_ptr0 + 8 * k; - asm volatile( - "vld1.32 {q0, q1}, [%[b0]]! \n" - "vld1.32 {q2, q3}, [%[b1]]! \n" - "vld1.32 {q4, q5}, [%[b0]]! \n" - "vld1.32 {q6, q7}, [%[b1]]! \n" - "vst1.32 {q0, q1}, [%[out_ptr0]]! \n" - "vst1.32 {q2, q3}, [%[out_ptr0]]! \n" - "vst1.32 {q4, q5}, [%[out_ptr1]]! \n" - "vst1.32 {q6, q7}, [%[out_ptr1]]! \n" - - "vld1.32 {q0, q1}, [%[b2]]! \n" - "vld1.32 {q2, q3}, [%[b3]]! \n" - "vld1.32 {q4, q5}, [%[b2]]! \n" - "vld1.32 {q6, q7}, [%[b3]]! \n" - "vst1.32 {q0, q1}, [%[out_ptr0]]! \n" - "vst1.32 {q2, q3}, [%[out_ptr0]]! \n" - "vst1.32 {q4, q5}, [%[out_ptr1]]! \n" - "vst1.32 {q6, q7}, [%[out_ptr1]]! \n" - : [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), [b0] "+r"(b0), - [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3) - : - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); - } - for (; j < n - 7; j += 8) { - float *out_ptr0 = output + j * k + 8 * i; - asm volatile( - "vld1.32 {q0, q1}, [%[b0]]! \n" - "vld1.32 {q2, q3}, [%[b1]]! \n" - "vld1.32 {q4, q5}, [%[b2]]! \n" - "vld1.32 {q6, q7}, [%[b3]]! \n" - "vst1.32 {q0, q1}, [%[out_ptr0]]! \n" - "vst1.32 {q2, q3}, [%[out_ptr0]]! 
\n" - "vst1.32 {q4, q5}, [%[out_ptr0]]! \n" - "vst1.32 {q6, q7}, [%[out_ptr0]]! \n" - : [out_ptr0] "+r"(out_ptr0), [b0] "+r"(b0), [b1] "+r"(b1), - [b2] "+r"(b2), [b3] "+r"(b3) - : - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); - } - if (j < n) { - float *out_ptr0 = output + j * k + 8 * i; - asm volatile( - "vld1.32 {q0, q1}, [%[b0]] \n" - "vld1.32 {q2, q3}, [%[b1]] \n" - "vld1.32 {q4, q5}, [%[b2]] \n" - "vld1.32 {q6, q7}, [%[b3]] \n" - "vand q0, q0, %q[vmask1] \n" - "vand q1, q1, %q[vmask2] \n" - "vand q2, q2, %q[vmask1] \n" - "vand q3, q3, %q[vmask2] \n" - "vand q4, q4, %q[vmask1] \n" - "vand q5, q5, %q[vmask2] \n" - "vand q6, q6, %q[vmask1] \n" - "vand q7, q7, %q[vmask2] \n" - - "vst1.32 {q0, q1}, [%[out_ptr0]]! \n" - "vst1.32 {q2, q3}, [%[out_ptr0]]! \n" - "vst1.32 {q4, q5}, [%[out_ptr0]]! \n" - "vst1.32 {q6, q7}, [%[out_ptr0]]! \n" - : [out_ptr0] "+r"(out_ptr0) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [b0] "r"(b0), - [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); - } - } - // remain k - for (int i = (k & 0xFFFFFFFC); i < k; ++i) { - const float *b0 = B + i * ldb; - int j = 0; - for (; j < n - 15; j += 16) { - float *out_ptr0 = output + j * k + 8 * i; - float *out_ptr1 = out_ptr0 + 8 * k; - asm volatile( - "vld1.32 {q0, q1}, [%[b0]]! \n" - "vld1.32 {q2, q3}, [%[b0]]! \n" - "vst1.32 {q0, q1}, [%[out_ptr0]]! \n" - "vst1.32 {q2, q3}, [%[out_ptr1]]! \n" - : [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), [b0] "+r"(b0) - : - : "memory", "q0", "q1", "q2", "q3"); - } - for (; j < n - 7; j += 8) { - float *out_ptr0 = output + j * k + 8 * i; - asm volatile( - "vld1.32 {q0, q1}, [%[b0]]! \n" - "vst1.32 {q0, q1}, [%[out_ptr0]]! 
\n" - : [out_ptr0] "+r"(out_ptr0), [b0] "+r"(b0) - : - : "memory", "q0", "q1"); - } - if (j < n) { - float *out_ptr0 = output + j * k + 8 * i; - asm volatile( - "vld1.32 {q0, q1}, [%[b0]] \n" - "vand q0, q0, %q[vmask1] \n" - "vand q1, q1, %q[vmask2] \n" - "vst1.32 {q0, q1}, [%[out_ptr0]] \n" - : [out_ptr0] "+r"(out_ptr0) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [b0] "r"(b0) - : "memory", "q0", "q1"); - } - } -} -#endif // __aarch64__ - -void write_back_alpha_beta(const int mc, const int nc, const float alpha, - const float *c, const int ldc1, const float beta, - float *C, const int ldc2) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - float32x4_t _alpha = vdupq_n_f32(alpha); - float32x4_t _beta = vdupq_n_f32(beta); - float32x4_t cv, cv2; - for (int i = 0; i < mc; ++i) { - const float *c_ptr = c + i * ldc1; - float *C_ptr = C + i * ldc2; - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv = vmulq_f32(_alpha, cv); - cv2 = vld1q_f32(C_ptr); - cv = vmlaq_f32(cv, _beta, cv2); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv = vmulq_f32(_alpha, cv); - cv2 = vld1q_f32(C_ptr); - cv = vmlaq_f32(cv, _beta, cv2); - switch (_nc1) { - case 3: - vst1q_lane_f32(C_ptr + 2, cv, 2); - case 2: - vst1_f32(C_ptr, vget_low_f32(cv)); - break; - case 1: - vst1q_lane_f32(C_ptr, cv, 0); - break; - } - } - } -} - -#if __aarch64__ -void write_back_alpha1_beta0(const int mc, const int nc, const float *c, - const int ldc1, float *C, const int ldc2) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - const float *c_ptr; - float *C_ptr; - float32x4_t cv; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * ldc1; - C_ptr = C + i * ldc2; - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - switch (_nc1) { - case 3: - vst1q_lane_f32(C_ptr + 2, cv, 2); - case 2: - vst1_f32(C_ptr, vget_low_f32(cv)); - break; - case 1: - 
vst1q_lane_f32(C_ptr, cv, 0); - break; - } - } - } -} - -void write_back_alpha1_beta1(const int mc, const int nc, const float *c, - const int ldc1, float *C, const int ldc2) { - int nc1 = nc / 4; - int _nc1 = nc % 4; - - const float *c_ptr; - float *C_ptr; - float32x4_t cv, cv2; - for (int i = 0; i < mc; ++i) { - c_ptr = c + i * ldc1; - C_ptr = C + i * ldc2; - for (int j = 0; j < nc1; ++j) { - cv = vld1q_f32(c_ptr); - cv2 = vld1q_f32(C_ptr); - cv = vaddq_f32(cv, cv2); - vst1q_f32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_f32(c_ptr); - cv2 = vld1q_f32(C_ptr); - cv = vaddq_f32(cv, cv2); - switch (_nc1) { - case 3: - vst1q_lane_f32(C_ptr + 2, cv, 2); - case 2: - vst1_f32(C_ptr, vget_low_f32(cv)); - break; - case 1: - vst1q_lane_f32(C_ptr, cv, 0); - break; - } - } - } -} - -#else -void write_back_alpha1_beta0(const int mc, const int nc, const float *c, - const int ldc1, float *C, const int ldc2) { - int nc1 = nc / 16; - int nc2 = nc % 16; - int step1 = 4 * (ldc1 - 16 * nc1); - int step2 = 4 * ldc2; - int volatile m = mc; - - const float *volatile c_ptr = c; - float *volatile C_ptr = C; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vst1.32 {q0, q1}, [r6]! \n\t" - - "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" - "vst1.32 {q2, q3}, [r6]! 
\n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "add %[C_ptr], %[C_ptr], %[step2] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), - [step1] "r"(step1), [step2] "r"(step2) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); - } - - if (nc2 != 0) { - for (int i = 0; i < mc; i++) { - const float *c0 = c_ptr + nc1 * 16 + i * ldc1; - float *C0 = C_ptr + nc1 * 16 + i * ldc2; - for (int j = 0; j < nc2; j++) { - *C0++ = *c0++; - } - } - } -} - -void write_back_alpha1_beta1(const int mc, const int nc, const float *c, - const int ldc1, float *C, const int ldc2) { - int nc1 = nc / 16; - int nc2 = nc % 16; - int step1 = 4 * (ldc1 - 16 * nc1); - int step2 = 4 * ldc2; - int volatile m = mc; - - const float *volatile c_ptr = c; - float *volatile C_ptr = C; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vld1.32 {q2, q3}, [r6] \n\t" - "vadd.f32 q0, q0, q2 \n\t" - "vadd.f32 q1, q1, q3 \n\t" - "vst1.32 {q0, q1}, [r6]! \n\t" - - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vld1.32 {q2, q3}, [r6] \n\t" - "vadd.f32 q0, q0, q2 \n\t" - "vadd.f32 q1, q1, q3 \n\t" - "vst1.32 {q0, q1}, [r6]! 
\n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "add %[C_ptr], %[C_ptr], %[step2] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), - [step1] "r"(step1), [step2] "r"(step2) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); - } - - if (nc2 != 0) { - for (int i = 0; i < mc; i++) { - const float *c0 = c_ptr + nc1 * 16 + i * ldc1; - float *C0 = C_ptr + nc1 * 16 + i * ldc2; - for (int j = 0; j < nc2; j++) { - *C0++ += *c0++; - } - } - } -} -#endif // __aarch64__ - -void write_back(const int mc, const int nc, const float alpha, const float *c, - const int ldc1, const float beta, float *C, const int ldc2) { - if (alpha == 1.f && beta == 0.f) { - write_back_alpha1_beta0(mc, nc, c, ldc1, C, ldc2); - } else if (alpha == 1.f && beta == 1.f) { - write_back_alpha1_beta1(mc, nc, c, ldc1, C, ldc2); - } else { - write_back_alpha_beta(mc, nc, alpha, c, ldc1, beta, C, ldc2); - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/math/gemm/strategy.h b/mobile/src/operators/math/gemm/strategy.h deleted file mode 100644 index 11e24fb1c3..0000000000 --- a/mobile/src/operators/math/gemm/strategy.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "operators/math/gemm/gemm_kernel.h" -#include "operators/math/gemm/pack_kernel.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -struct SgemmStrategy { - typedef float Itype; - typedef float Otype; - - typedef void (*packLhsFunc)(const int, const int, const Itype *, const int, - Itype *, const bool); - typedef void (*packRhsFunc)(const int, const int, const Itype *, const int, - Itype *, const bool); - typedef void (*kernelFunc)(const Itype *, const Itype *, const int, Otype *, - const int); - typedef void (*WriteFunc)(const int, const int, const float alpha, - const Otype *, const int, const float beta, Otype *, - const int); - - packLhsFunc pack_lhs; - packRhsFunc pack_rhs; - kernelFunc kernel; - WriteFunc write; - - static int out_width() { -#if __aarch64__ - return 16; -#else - return 8; -#endif - } - - static int out_height() { return 6; } - - SgemmStrategy() { - pack_lhs = pack_lhs_6r; -#if __aarch64__ - pack_rhs = pack_rhs_16c; - kernel = sgemm_6x16; -#else - pack_rhs = pack_rhs_8c; - kernel = sgemm_6x8; -#endif - write = write_back; - } -}; - -struct I8o32gemmStrategy { - typedef int8_t Itype; - typedef int32_t Otype; - - typedef void (*kern_type)(const Itype *, const Itype *, const int, Otype *, - const int); - kern_type kernel; - - static int out_width() { return 8; } - - static int out_height() { -#if __aarch64__ - return 12; -#else - return 6; -#endif - } - - I8o32gemmStrategy() {} -}; - -struct SgemvStrategy { - typedef float Itype; - typedef float Otype; - - typedef void (*kernelFunc)(const bool, const int, const int, const float, - const Itype *, const int, const Itype *, - const float, Otype *); - kernelFunc kernel; - - SgemvStrategy() { kernel = sgemv_mx1; } -}; - -struct I8o32gemvStrategy { - typedef int8_t Itype; - typedef int32_t Otype; - - typedef void (*kern_type)(const Itype *, const 
Itype *, const int, Otype *, - const int); - kern_type kernel; - - static int out_width() { return 1; } - - static int out_height() { -#if __aarch64__ - return 12; -#else - return 6; -#endif - } -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gemm_int8.cpp b/mobile/src/operators/math/gemm_int8.cpp deleted file mode 100644 index 19a5b88cbe..0000000000 --- a/mobile/src/operators/math/gemm_int8.cpp +++ /dev/null @@ -1,2077 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "common/log.h" -#include "operators/math/gemm.h" -#if __ARM_NEON -#include -#include - -#endif -#ifdef _OPENMP -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { -void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc) { -#if __ARM_NEON -#if __aarch64__ -// AddDot4x8 used only for aarch32 -#else - const int8_t *a_ptr, *b_ptr; - a_ptr = a; - b_ptr = b; - int32_t kc1 = k >> 3; - int32_t kc2 = k & 7; - int32_t kc3 = kc2 >> 2; - int32_t kc4 = kc2 & 3; - int32_t kc5 = kc4 >> 1; - int32_t kc6 = kc4 & 1; - int32_t step = sizeof(int32_t) * ldc; - asm volatile( - // q8-q15: save 32 results - "pld [%[a_ptr]] \n\t" - "pld [%[b_ptr]] \n\t" - "pld [%[b_ptr], #64] \n\t" - "vmov.s32 q8, #0 \n\t" - "vmov.s32 q9, q8 \n\t" - "vmov.s32 q10, q8 \n\t" - "vmov.s32 q11, q8 \n\t" - "vmov.s32 q12, q8 \n\t" - "vmov.s32 q13, q8 \n\t" - "vmov.s32 q14, q8 \n\t" - "vmov.s32 q15, q8 \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "blt 1f \n\t" - "0: \n\t" - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr], #128] \n\t" - "vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols - "vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B first 4 rows - "vmovl.s8 q2, d0 \n\t" // process B first - // rows - "vmovl.s8 q3, d8 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vmovl.s8 q3, d9 \n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - "vld1.s8 {d12-d15}, [%[b_ptr]]! 
\n\t" // load B second 4 - // rows - "vmovl.s8 q2, d1 \n\t" - "vmovl.s8 q3, d10 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vmovl.s8 q3, d11 \n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - "vmovl.s8 q2, d2 \n\t" // process B second 4 - // rows - "vmovl.s8 q3, d12 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vmovl.s8 q3, d13 \n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - "vmovl.s8 q2, d3 \n\t" - "vmovl.s8 q3, d14 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vmovl.s8 q3, d15 \n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge 0b \n\t" - "1: \n\t" // last 4 rows - "subs %[kc3], %[kc3], #1 \n\t" - "blt 2f \n\t" - 
"vld1.s8 {d0-d1}, [%[a_ptr]]! \n\t" // load A 4 cols - "vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B 4 rows - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d8 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vmovl.s8 q3, d9 \n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - "vmovl.s8 q2, d1 \n\t" - "vmovl.s8 q3, d10 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vmovl.s8 q3, d11 \n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - "2: \n\t" // last 2 rows - "subs %[kc5], %[kc5], #1 \n\t" - "blt 3f \n\t" - "vld1.s8 {d0}, [%[a_ptr]]! \n\t" // load A 2 cols - "vld1.s8 {d8-d9}, [%[b_ptr]]! 
\n\t" // load B 2 rows - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d8 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vmovl.s8 q3, d9 \n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - "3: \n\t" // last 1 row - "subs %[kc6], %[kc6], #1 \n\t" - "blt 4f \n\t" - "vld1.s8 {d0}, [%[a_ptr]] \n\t" // load A 1 col - "vld1.s8 {d8}, [%[b_ptr]] \n\t" // load B 1 row - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d8 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "4: \n\t" - "vst1.32 {q8, q9}, [%[c]], %[step] \n\t" - "vst1.32 {q10, q11}, [%[c]], %[step] \n\t" - "vst1.32 {q12, q13}, [%[c]], %[step] \n\t" - "vst1.32 {q14, q15}, [%[c]] \n\t" - : - : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -#endif // __aarch64__ -#endif // __ARM_NEON -} - -// The core idea of AddDot4x2 and AddDot4x4 function is borrowed from the -// Google's gemmlowp open source library. The address of gemmlowp is -// https://github.com/google/gemmlowp. 
-void Gemm::AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc) { -#if __ARM_NEON -#if __aarch64__ -// AddDot4x2 used only for aarch32 -#else -#define PADDLE_LABEL_LOOP "1" -#define PADDLE_LABEL_AFTER_LOOP "2" - asm volatile( - "lsl %[ldc], %[ldc], #2 \n\t" // sizeof(int32) == 4 - "vldr d0, [%[b], #0] \n\t" - "vmov.s32 q8, #0 \n\t" - "vldr d4, [%[a], #0] \n\t" - "vmov.s32 q9, q8 \n\t" - "vldr d2, [%[b], #16] \n\t" - "vmov.s32 q10, q8 \n\t" - "vldr d6, [%[a], #16] \n\t" - "vmov.s32 q11, q8 \n\t" - "vldr d1, [%[b], #8]\n\t" - "vmov.s32 q12, q8 \n\t" - "vldr d5, [%[a], #8]\n" - "vmov.s32 q13, q8 \n\t" - "vldr d3, [%[b], #24]\n\t" - "vmov.s32 q14, q8 \n\t" - "vldr d7, [%[a], #24]\n" - "vmov.s32 q15, q8 \n\t" - - PADDLE_LABEL_LOOP - ": \n\t" - "vmull.s8 q4, d0, d4 \n\t" // first half - "add %[b], %[b], #32 \n\t" - "vmull.s8 q5, d2, d4 \n\t" - "vldr d4, [%[a], #32] \n\t" - "vmull.s8 q6, d0, d6 \n\t" - "vmull.s8 q7, d2, d6 \n\t" - "vldr d6, [%[a], #48] \n\t" - - "vmlal.s8 q4, d1, d5 \n\t" // second half - "vmlal.s8 q5, d3, d5 \n\t" - "vldr d5, [%[a], #40] \n\t" - "vmlal.s8 q6, d1, d7 \n\t" - "vmlal.s8 q7, d3, d7 \n\t" - "vldr d7, [%[a], #56] \n\t" - - "vpadal.s16 q8, q4 \n\t" // pairwise-add - "add %[a], %[a], #64 \n\t" - "vpadal.s16 q9, q5 \n\t" - "subs %[k], %[k], #16 \n\t" - "vpadal.s16 q10, q6 \n\t" - "vpadal.s16 q11, q7 \n\t" - - "beq " PADDLE_LABEL_AFTER_LOOP - "f \n\t" - - "vmull.s8 q4, d0, d4 \n\t" // first half - "vmull.s8 q5, d2, d4 \n\t" - "vldr d4, [%[a], #0] \n\t" - "vmull.s8 q6, d0, d6 \n\t" - "vldr d0, [%[b], #0] \n\t" - "vmull.s8 q7, d2, d6 \n\t" - "vldr d2, [%[b], #16] \n\t" - - "vmlal.s8 q4, d1, d5 \n\t" // second half - "vldr d6, [%[a], #16] \n\t" - "vmlal.s8 q5, d3, d5 \n\t" - "vldr d5, [%[a], #8] \n\t" - "vmlal.s8 q6, d1, d7 \n\t" - "vldr d1, [%[b], #8] \n\t" - "vmlal.s8 q7, d3, d7 \n\t" - "vldr d3, [%[b], #24] \n\t" - - "vpadal.s16 q12, q4 \n\t" // pairwise-add - "vldr d7, [%[a], #24] \n\t" - "vpadal.s16 q13, q5 \n\t" - 
"vpadal.s16 q14, q6 \n\t" - "vpadal.s16 q15, q7 \n\t" - - "b " PADDLE_LABEL_LOOP "b \n\t" - - PADDLE_LABEL_AFTER_LOOP - ": \n\t" - "vmull.s8 q4, d0, d4 \n\t" // first half - "vmull.s8 q5, d2, d4 \n\t" - "vmull.s8 q6, d0, d6 \n\t" - "vmull.s8 q7, d2, d6 \n\t" - - "vmlal.s8 q4, d1, d5 \n\t" // second half - "vmlal.s8 q5, d3, d5 \n\t" - "vmlal.s8 q6, d1, d7 \n\t" - "vmlal.s8 q7, d3, d7 \n\t" - - "vpadal.s16 q12, q4 \n\t" // pairwise-add - "vpadal.s16 q13, q5 \n\t" - "vpadal.s16 q14, q6 \n\t" - "vpadal.s16 q15, q7 \n\t" - - "vpadd.s32 d0, d16, d17 \n\t" // reduce to int32 - "vpadd.s32 d1, d18, d19 \n\t" - "vpadd.s32 d2, d20, d21 \n\t" - "vpadd.s32 d3, d22, d23 \n\t" - "vpadd.s32 d4, d24, d25 \n\t" - "vpadd.s32 d5, d26, d27 \n\t" - "vpadd.s32 d6, d28, d29 \n\t" - "vpadd.s32 d7, d30, d31 \n\t" - - "vpadd.s32 d8, d0, d1 \n\t" // reduce to int32 again - "vpadd.s32 d9, d2, d3 \n\t" - "vpadd.s32 d10, d4, d5 \n\t" - "vpadd.s32 d11, d6, d7 \n\t" - - "vst1.32 {d8}, [%[c]], %[ldc] \n\t" - "vst1.32 {d9}, [%[c]], %[ldc] \n\t" - "vst1.32 {d10}, [%[c]], %[ldc] \n\t" - "vst1.32 {d11}, [%[c]] \n\t" - - : [k] "+r"(k), [a] "+r"(a), [b] "+r"(b), [c] "+r"(c) - : [ldc] "r"(ldc) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -#undef PADDLE_LABEL_AFTER_LOOP -#undef PADDLE_LABEL_LOOP - -#endif // __aarch64__ -#endif // __ARM_NEON -} - -void Gemm::AddDot4x4(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc) { -#if __ARM_NEON -#if __aarch64__ -#define PADDLE_LABEL_LOOP "1" -#define PADDLE_LABEL_AFTER_LOOP "2" - asm volatile( - // load data from matrix a and b,and set zero to result register - "ld1 {v0.16b}, [%[b]], #16\n" - "dup v16.4s, wzr\n" - "ld1 {v4.16b}, [%[a]], #16\n" - "dup v17.4s, wzr\n" - "ld1 {v1.16b}, [%[b]], #16\n" - "dup v18.4s, wzr\n" - "ld1 {v5.16b}, [%[a]], #16\n" - "dup v19.4s, wzr\n" - "ld1 {v2.16b}, [%[b]], #16\n" - "dup v20.4s, wzr\n" - "ld1 {v3.16b}, [%[b]], #16\n" - "dup 
v21.4s, wzr\n" - "ld1 {v6.16b}, [%[a]], #16\n" - "dup v22.4s, wzr\n" - "ld1 {v7.16b}, [%[a]], #16\n" - "dup v23.4s, wzr\n" - "dup v24.4s, wzr\n" - "dup v25.4s, wzr\n" - "dup v26.4s, wzr\n" - "dup v27.4s, wzr\n" - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" - - // Multiply ldc by 4 == sizeof(int32) - "lsl %[ldc], %[ldc], #2\n" - - // first half - "smull v8.8h, v0.8b, v4.8b\n" - "smull v9.8h, v1.8b, v4.8b\n" - "smull v10.8h, v2.8b, v4.8b\n" - "smull v11.8h, v3.8b, v4.8b\n" - "smull v12.8h, v0.8b, v5.8b\n" - "smull v13.8h, v1.8b, v5.8b\n" - "smull v14.8h, v2.8b, v5.8b\n" - "smull v15.8h, v3.8b, v5.8b\n" - - // Multiply-accumulate second-half - "smlal2 v8.8h, v0.16b, v4.16b\n" - "smlal2 v9.8h, v1.16b, v4.16b\n" - "smlal2 v10.8h, v2.16b, v4.16b\n" - "smlal2 v11.8h, v3.16b, v4.16b\n" - "smlal2 v12.8h, v0.16b, v5.16b\n" - "smlal2 v13.8h, v1.16b, v5.16b\n" - "smlal2 v14.8h, v2.16b, v5.16b\n" - "smlal2 v15.8h, v3.16b, v5.16b\n" - - "subs %[k], %[k], #16\n" - - // skip the loop - "beq " PADDLE_LABEL_AFTER_LOOP "f\n" - - // loop - PADDLE_LABEL_LOOP - ":\n" - - // first half - "sadalp v16.4s, v8.8h\n" - "ld1 {v4.16b}, [%[a]], #16\n" - "smull v8.8h, v0.8b, v6.8b\n" - "sadalp v17.4s, v9.8h\n" - "ld1 {v5.16b}, [%[a]], #16\n" - "smull v9.8h, v1.8b, v6.8b\n" - "sadalp v18.4s, v10.8h\n" - "smull v10.8h, v2.8b, v6.8b\n" - "sadalp v19.4s, v11.8h\n" - "smull v11.8h, v3.8b, v6.8b\n" - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v0.8b, v7.8b\n" - "sadalp v21.4s, v13.8h\n" - "smull v13.8h, v1.8b, v7.8b\n" - "sadalp v22.4s, v14.8h\n" - "smull v14.8h, v2.8b, v7.8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v15.8h, v3.8b, v7.8b\n" - - // Multiply-accumulate second-half - "smlal2 v8.8h, v0.16b, v6.16b\n" - "smlal2 v9.8h, v1.16b, v6.16b\n" - "smlal2 v10.8h, v2.16b, v6.16b\n" - "smlal2 v11.8h, v3.16b, v6.16b\n" - - "ld1 {v6.16b}, [%[a]], #16\n" - - "smlal2 v12.8h, v0.16b, v7.16b\n" - "ld1 {v0.16b}, [%[b]], #16\n" - "smlal2 v13.8h, v1.16b, v7.16b\n" - "ld1 
{v1.16b}, [%[b]], #16\n" - "smlal2 v14.8h, v2.16b, v7.16b\n" - "ld1 {v2.16b}, [%[b]], #16\n" - "smlal2 v15.8h, v3.16b, v7.16b\n" - "ld1 {v3.16b}, [%[b]], #16\n" - - // first half - "sadalp v24.4s, v8.8h\n" - "smull v8.8h, v0.8b, v4.8b\n" - "sadalp v25.4s, v9.8h\n" - "ld1 {v7.16b}, [%[a]], #16\n" - "smull v9.8h, v1.8b, v4.8b\n" - "sadalp v26.4s, v10.8h\n" - "smull v10.8h, v2.8b, v4.8b\n" - "sadalp v27.4s, v11.8h\n" - "smull v11.8h, v3.8b, v4.8b\n" - "sadalp v28.4s, v12.8h\n" - "smull v12.8h, v0.8b, v5.8b\n" - "sadalp v29.4s, v13.8h\n" - "smull v13.8h, v1.8b, v5.8b\n" - "sadalp v30.4s, v14.8h\n" - "smull v14.8h, v2.8b, v5.8b\n" - "sadalp v31.4s, v15.8h\n" - "smull v15.8h, v3.8b, v5.8b\n" - - // Multiply-accumulate second-half - "smlal2 v8.8h, v0.16b, v4.16b\n" - "smlal2 v9.8h, v1.16b, v4.16b\n" - "smlal2 v10.8h, v2.16b, v4.16b\n" - "smlal2 v11.8h, v3.16b, v4.16b\n" - - // Loop - "subs %[k], %[k], #16\n" - - "smlal2 v12.8h, v0.16b, v5.16b\n" - "smlal2 v13.8h, v1.16b, v5.16b\n" - "smlal2 v14.8h, v2.16b, v5.16b\n" - "smlal2 v15.8h, v3.16b, v5.16b\n" - - "bne " PADDLE_LABEL_LOOP "b\n" - - // Final - PADDLE_LABEL_AFTER_LOOP - ":\n" - - // first half - "sadalp v16.4s, v8.8h\n" - "smull v8.8h, v0.8b, v6.8b\n" - "sadalp v17.4s, v9.8h\n" - "smull v9.8h, v1.8b, v6.8b\n" - "sadalp v18.4s, v10.8h\n" - "smull v10.8h, v2.8b, v6.8b\n" - "sadalp v19.4s, v11.8h\n" - "smull v11.8h, v3.8b, v6.8b\n" - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v0.8b, v7.8b\n" - "sadalp v21.4s, v13.8h\n" - "smull v13.8h, v1.8b, v7.8b\n" - "sadalp v22.4s, v14.8h\n" - "smull v14.8h, v2.8b, v7.8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v15.8h, v3.8b, v7.8b\n" - - // Multiply-accumulate second-half - "smlal2 v8.8h, v0.16b, v6.16b\n" - "smlal2 v9.8h, v1.16b, v6.16b\n" - "smlal2 v10.8h, v2.16b, v6.16b\n" - "smlal2 v11.8h, v3.16b, v6.16b\n" - "smlal2 v12.8h, v0.16b, v7.16b\n" - "smlal2 v13.8h, v1.16b, v7.16b\n" - "smlal2 v14.8h, v2.16b, v7.16b\n" - "smlal2 v15.8h, v3.16b, v7.16b\n" - - "sadalp v24.4s, v8.8h\n" 
- "sadalp v25.4s, v9.8h\n" - "sadalp v26.4s, v10.8h\n" - "sadalp v27.4s, v11.8h\n" - "sadalp v28.4s, v12.8h\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "sadalp v31.4s, v15.8h\n" - - // Reduce 32bit accumulators horizontally. - "addp v0.4s, v16.4s, v17.4s\n" - "addp v1.4s, v18.4s, v19.4s\n" - "addp v2.4s, v20.4s, v21.4s\n" - "addp v3.4s, v22.4s, v23.4s\n" - "addp v4.4s, v24.4s, v25.4s\n" - "addp v5.4s, v26.4s, v27.4s\n" - "addp v6.4s, v28.4s, v29.4s\n" - "addp v7.4s, v30.4s, v31.4s\n" - - // Reduce 32bit accumulators horizontally, second pass - // (each pass adds pairwise. we need to add 4-wise). - "addp v12.4s, v0.4s, v1.4s\n" - "addp v13.4s, v2.4s, v3.4s\n" - "addp v14.4s, v4.4s, v5.4s\n" - "addp v15.4s, v6.4s, v7.4s\n" - - "st1 {v12.4s}, [%[c]], %[ldc] \n\t" - "st1 {v13.4s}, [%[c]], %[ldc] \n\t" - "st1 {v14.4s}, [%[c]], %[ldc] \n\t" - "st1 {v15.4s}, [%[c]] \n\t" - - : [k] "+r"(k), [a] "+r"(a), [b] "+r"(b), [c] "+r"(c) // outputs - : [ldc] "r"(ldc) // inputs - : "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28", "v29", "v30", "v31"); // clobbers -#undef PADDLE_LABEL_AFTER_LOOP -#undef PADDLE_LABEL_LOOP -#else -// AddDot4x2 used only for aarch64 -#endif // __aarch64__ -#endif // __ARM_NEON -} - -// 8 bits int small block inner product -void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc) { -#if __ARM_NEON -#if __aarch64__ -// AddDot6x8 used only for aarch32 -#else - const int8_t *a_ptr, *b_ptr; - a_ptr = a; - b_ptr = b; - int32_t kc1 = k >> 3; - int32_t kc2 = k & 7; - int32_t kc3 = kc2 >> 2; - int32_t kc4 = kc2 & 3; - int32_t kc5 = kc4 >> 1; - int32_t kc6 = kc4 & 1; - int32_t step = sizeof(int32_t) * ldc; - asm volatile( - // q4-q15: save 48 results - "pld [%[a_ptr]] \n\t" - "pld [%[b_ptr]] \n\t" - "pld [%[b_ptr], #64] \n\t" - "vmov.s32 q4, 
#0 \n\t" - "vmov.s32 q5, q4 \n\t" - "vmov.s32 q6, q4 \n\t" - "vmov.s32 q7, q4 \n\t" - "vmov.s32 q8, q4 \n\t" - "vmov.s32 q9, q4 \n\t" - "vmov.s32 q10, q4 \n\t" - "vmov.s32 q11, q4 \n\t" - "vmov.s32 q12, q4 \n\t" - "vmov.s32 q13, q4 \n\t" - "vmov.s32 q14, q4 \n\t" - "vmov.s32 q15, q4 \n\t" - "mov r0, #12 \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "blt 1f \n\t" - "0: \n\t" - "pld [%[a_ptr], #64] \n\t" - "pld [%[b_ptr], #128] \n\t" - "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" // A 4 cols - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[0]\n\t" - "vmlal.s16 q5, d7, d4[0]\n\t" - "vmlal.s16 q6, d6, d4[1]\n\t" - "vmlal.s16 q7, d7, d4[1]\n\t" - "vmlal.s16 q8, d6, d4[2]\n\t" - "vmlal.s16 q9, d7, d4[2]\n\t" - "vmlal.s16 q10, d6, d4[3]\n\t" - "vmlal.s16 q11, d7, d4[3]\n\t" - "vmlal.s16 q12, d6, d5[0]\n\t" - "vmlal.s16 q13, d7, d5[0]\n\t" - "vmlal.s16 q14, d6, d5[1]\n\t" - "vmlal.s16 q15, d7, d5[1]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 2nd row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d5[2]\n\t" - "vmlal.s16 q5, d7, d5[2]\n\t" - "vmlal.s16 q6, d6, d5[3]\n\t" - "vmlal.s16 q7, d7, d5[3]\n\t" - "vmovl.s8 q2, d1 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 3th row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d5[0]\n\t" - "vmlal.s16 q5, d7, d5[0]\n\t" - "vmlal.s16 q6, d6, d5[1]\n\t" - "vmlal.s16 q7, d7, d5[1]\n\t" - "vmlal.s16 q8, d6, d5[2]\n\t" - "vmlal.s16 q9, d7, d5[2]\n\t" - "vmlal.s16 q10, d6, d5[3]\n\t" - "vmlal.s16 q11, d7, d5[3]\n\t" - "vmovl.s8 q2, d2 \n\t" - "vmlal.s16 q12, d6, d4[0]\n\t" - "vmlal.s16 q13, d7, d4[0]\n\t" - "vmlal.s16 q14, d6, d4[1]\n\t" - "vmlal.s16 q15, d7, d4[1]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! 
\n\t" // B 4th row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[2]\n\t" - "vmlal.s16 q5, d7, d4[2]\n\t" - "vmlal.s16 q6, d6, d4[3]\n\t" - "vmlal.s16 q7, d7, d4[3]\n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - - "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" // A 4 cols - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[0]\n\t" - "vmlal.s16 q5, d7, d4[0]\n\t" - "vmlal.s16 q6, d6, d4[1]\n\t" - "vmlal.s16 q7, d7, d4[1]\n\t" - "vmlal.s16 q8, d6, d4[2]\n\t" - "vmlal.s16 q9, d7, d4[2]\n\t" - "vmlal.s16 q10, d6, d4[3]\n\t" - "vmlal.s16 q11, d7, d4[3]\n\t" - "vmlal.s16 q12, d6, d5[0]\n\t" - "vmlal.s16 q13, d7, d5[0]\n\t" - "vmlal.s16 q14, d6, d5[1]\n\t" - "vmlal.s16 q15, d7, d5[1]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 2nd row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d5[2]\n\t" - "vmlal.s16 q5, d7, d5[2]\n\t" - "vmlal.s16 q6, d6, d5[3]\n\t" - "vmlal.s16 q7, d7, d5[3]\n\t" - "vmovl.s8 q2, d1 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 3th row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d5[0]\n\t" - "vmlal.s16 q5, d7, d5[0]\n\t" - "vmlal.s16 q6, d6, d5[1]\n\t" - "vmlal.s16 q7, d7, d5[1]\n\t" - "vmlal.s16 q8, d6, d5[2]\n\t" - "vmlal.s16 q9, d7, d5[2]\n\t" - "vmlal.s16 q10, d6, d5[3]\n\t" - "vmlal.s16 q11, d7, d5[3]\n\t" - "vmovl.s8 q2, d2 \n\t" - "vmlal.s16 q12, d6, d4[0]\n\t" - "vmlal.s16 q13, d7, d4[0]\n\t" - "vmlal.s16 q14, d6, d4[1]\n\t" - "vmlal.s16 q15, d7, d4[1]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! 
\n\t" // B 4th row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[2]\n\t" - "vmlal.s16 q5, d7, d4[2]\n\t" - "vmlal.s16 q6, d6, d4[3]\n\t" - "vmlal.s16 q7, d7, d4[3]\n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "bge 0b \n\t" - "1: \n\t" // last <8 rows - "subs %[kc3], %[kc3], #1 \n\t" - "blt 2f \n\t" - "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" // A 4 cols - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[0]\n\t" - "vmlal.s16 q5, d7, d4[0]\n\t" - "vmlal.s16 q6, d6, d4[1]\n\t" - "vmlal.s16 q7, d7, d4[1]\n\t" - "vmlal.s16 q8, d6, d4[2]\n\t" - "vmlal.s16 q9, d7, d4[2]\n\t" - "vmlal.s16 q10, d6, d4[3]\n\t" - "vmlal.s16 q11, d7, d4[3]\n\t" - "vmlal.s16 q12, d6, d5[0]\n\t" - "vmlal.s16 q13, d7, d5[0]\n\t" - "vmlal.s16 q14, d6, d5[1]\n\t" - "vmlal.s16 q15, d7, d5[1]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 2nd row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d5[2]\n\t" - "vmlal.s16 q5, d7, d5[2]\n\t" - "vmlal.s16 q6, d6, d5[3]\n\t" - "vmlal.s16 q7, d7, d5[3]\n\t" - "vmovl.s8 q2, d1 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! 
\n\t" // B 3th row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d5[0]\n\t" - "vmlal.s16 q5, d7, d5[0]\n\t" - "vmlal.s16 q6, d6, d5[1]\n\t" - "vmlal.s16 q7, d7, d5[1]\n\t" - "vmlal.s16 q8, d6, d5[2]\n\t" - "vmlal.s16 q9, d7, d5[2]\n\t" - "vmlal.s16 q10, d6, d5[3]\n\t" - "vmlal.s16 q11, d7, d5[3]\n\t" - "vmovl.s8 q2, d2 \n\t" - "vmlal.s16 q12, d6, d4[0]\n\t" - "vmlal.s16 q13, d7, d4[0]\n\t" - "vmlal.s16 q14, d6, d4[1]\n\t" - "vmlal.s16 q15, d7, d4[1]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 4th row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[2]\n\t" - "vmlal.s16 q5, d7, d4[2]\n\t" - "vmlal.s16 q6, d6, d4[3]\n\t" - "vmlal.s16 q7, d7, d4[3]\n\t" - "vmlal.s16 q8, d6, d5[0]\n\t" - "vmlal.s16 q9, d7, d5[0]\n\t" - "vmlal.s16 q10, d6, d5[1]\n\t" - "vmlal.s16 q11, d7, d5[1]\n\t" - "vmlal.s16 q12, d6, d5[2]\n\t" - "vmlal.s16 q13, d7, d5[2]\n\t" - "vmlal.s16 q14, d6, d5[3]\n\t" - "vmlal.s16 q15, d7, d5[3]\n\t" - - "2: \n\t" // last <4 rows - "subs %[kc5], %[kc5], #1 \n\t" - "blt 3f \n\t" - "vld1.s8 {d0, d1}, [%[a_ptr]], r0 \n\t" - "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[0]\n\t" - "vmlal.s16 q5, d7, d4[0]\n\t" - "vmlal.s16 q6, d6, d4[1]\n\t" - "vmlal.s16 q7, d7, d4[1]\n\t" - "vmlal.s16 q8, d6, d4[2]\n\t" - "vmlal.s16 q9, d7, d4[2]\n\t" - "vmlal.s16 q10, d6, d4[3]\n\t" - "vmlal.s16 q11, d7, d4[3]\n\t" - "vmlal.s16 q12, d6, d5[0]\n\t" - "vmlal.s16 q13, d7, d5[0]\n\t" - "vmlal.s16 q14, d6, d5[1]\n\t" - "vmlal.s16 q15, d7, d5[1]\n\t" - "vld1.s8 {d3}, [%[b_ptr]]! 
\n\t" // B 2nd row - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d5[2]\n\t" - "vmlal.s16 q5, d7, d5[2]\n\t" - "vmlal.s16 q6, d6, d5[3]\n\t" - "vmlal.s16 q7, d7, d5[3]\n\t" - "vmovl.s8 q2, d1 \n\t" - "vmlal.s16 q8, d6, d4[0]\n\t" - "vmlal.s16 q9, d7, d4[0]\n\t" - "vmlal.s16 q10, d6, d4[1]\n\t" - "vmlal.s16 q11, d7, d4[1]\n\t" - "vmlal.s16 q12, d6, d4[2]\n\t" - "vmlal.s16 q13, d7, d4[2]\n\t" - "vmlal.s16 q14, d6, d4[3]\n\t" - "vmlal.s16 q15, d7, d4[3]\n\t" - "3: \n\t" // last <2 rows - "subs %[kc6], %[kc6], #1 \n\t" - "blt 4f \n\t" - "vld1.s8 {d0}, [%[a_ptr]] \n\t" - "vld1.s8 {d3}, [%[b_ptr]] \n\t" - "vmovl.s8 q2, d0 \n\t" - "vmovl.s8 q3, d3 \n\t" - "vmlal.s16 q4, d6, d4[0]\n\t" - "vmlal.s16 q5, d7, d4[0]\n\t" - "vmlal.s16 q6, d6, d4[1]\n\t" - "vmlal.s16 q7, d7, d4[1]\n\t" - "vmlal.s16 q8, d6, d4[2]\n\t" - "vmlal.s16 q9, d7, d4[2]\n\t" - "vmlal.s16 q10, d6, d4[3]\n\t" - "vmlal.s16 q11, d7, d4[3]\n\t" - "vmlal.s16 q12, d6, d5[0]\n\t" - "vmlal.s16 q13, d7, d5[0]\n\t" - "vmlal.s16 q14, d6, d5[1]\n\t" - "vmlal.s16 q15, d7, d5[1]\n\t" - "4: \n\t" - "vst1.32 {q4, q5}, [%[c]], %[step] \n\t" - "vst1.32 {q6, q7}, [%[c]], %[step] \n\t" - "vst1.32 {q8, q9}, [%[c]], %[step] \n\t" - "vst1.32 {q10, q11}, [%[c]], %[step] \n\t" - "vst1.32 {q12, q13}, [%[c]], %[step] \n\t" - "vst1.32 {q14, q15}, [%[c]] \n\t" - : - : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step) - : "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -#endif // __aarch64__ -#endif // __ARM_NEON -} - -// 8 bits int inner product -template <> -void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a, - const int8_t *b, float beta, int32_t *c, int8_t *C, - int32_t ldc, bool relu) {} -template <> -void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a, - const int8_t *b, float beta, int32_t *c, int32_t *C, - int32_t ldc, 
bool relu) { -#pragma omp parallel for - for (int32_t j = 0; j < nc; j += NR_INT8) { - for (int32_t i = 0; i < mc; i += MR_INT8) { -#if __aarch64__ - AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#else - AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif // __aarch64__ - } - } - if (!relu) { - WriteBasic(mc, nc, c, C, ldc); - return; - } -} - -template <> -void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, - const int8_t *a, const int8_t *b, float beta, - int32_t *c, int8_t *C, int32_t ldc, bool relu, - int32_t *bias, bool addOnRow) { -#pragma omp parallel for - for (int32_t j = 0; j < nc; j += NR_INT8) { - for (int32_t i = 0; i < mc; i += MR_INT8) { -#if __aarch64__ - AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#else - AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif // __aarch64__ - } - } - if (relu) { - WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha); - return; - } else { - if (addOnRow) { - WriteWithAddScaleT(mc, nc, c, C, ldc, bias, alpha); - } else { - WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha); - } - } -} - -template <> -void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, - const int8_t *a, const int8_t *b, float beta, - int32_t *c, int32_t *C, int32_t ldc, bool relu, - int32_t *bias, bool addOnRow) {} - -// 8 bits int PackMatrixA_4r -void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, - int32_t lda, int8_t *buffer) { - const int8_t *a0, *a1, *a2, *a3; - for (int32_t i = 0; i < m - m_tail; i += 4) { - a0 = A + i * lda; - a1 = A + (i + 1) * lda; - a2 = A + (i + 2) * lda; - a3 = A + (i + 3) * lda; - for (int32_t j = 0; j < k; ++j) { - *buffer++ = *a0++; - *buffer++ = *a1++; - *buffer++ = *a2++; - *buffer++ = *a3++; - } - } - - if (m_tail != 0) { - a0 = &A(m - m_tail, 0); - a1 = a0 + lda; - a2 = a0 + 2 * lda; - a3 = a0 + 3 * lda; - switch (m_tail) { - case 1: - a1 = zero_int8; - case 2: - a2 = zero_int8; - case 3: - a3 = 
zero_int8; - break; - default: - break; - } - for (int j = 0; j < k; ++j) { - *buffer++ = *a0++; - *buffer++ = *a1++; - *buffer++ = *a2++; - *buffer++ = *a3++; - } - } -} - -// 8 bits int PackMatrixA_6r -void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, - int32_t lda, int8_t *buffer) { - const int32_t i_length = m - m_tail; - for (int32_t i = 0; i < i_length; i += 6) { - const int8_t *a0 = A + i * lda; - const int8_t *a1 = A + (i + 1) * lda; - const int8_t *a2 = A + (i + 2) * lda; - const int8_t *a3 = A + (i + 3) * lda; - const int8_t *a4 = A + (i + 4) * lda; - const int8_t *a5 = A + (i + 5) * lda; - int8_t *local_buffer = buffer + i * k; - for (int32_t j = 0; j < k; ++j) { - *local_buffer++ = *a0++; - *local_buffer++ = *a1++; - *local_buffer++ = *a2++; - *local_buffer++ = *a3++; - *local_buffer++ = *a4++; - *local_buffer++ = *a5++; - } - } - if (m_tail != 0) { - const int8_t *a0 = &A(i_length, 0); - const int8_t *a1 = a0 + lda; - const int8_t *a2 = a0 + 2 * lda; - const int8_t *a3 = a0 + 3 * lda; - const int8_t *a4 = a0 + 4 * lda; - const int8_t *a5 = a0 + 5 * lda; - int8_t *local_buffer = buffer + i_length * k; - switch (m_tail) { - case 1: - a1 = zero_int8; - case 2: - a2 = zero_int8; - case 3: - a3 = zero_int8; - case 4: - a4 = zero_int8; - case 5: - a5 = zero_int8; - break; - default: - break; - } - for (int32_t j = 0; j < k; ++j) { - *local_buffer++ = *a0++; - *local_buffer++ = *a1++; - *local_buffer++ = *a2++; - *local_buffer++ = *a3++; - *local_buffer++ = *a4++; - *local_buffer++ = *a5++; - } - } -} - -// 8 bits int PackMatrixB -void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, - int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; - for (int32_t j = 0; j < j_length; j += 8) { - int8_t *local_buffer = buffer + j * k; - for (int32_t i = 0; i < k; ++i) { - const int8_t *b0 = &B(i, j); -#if __ARM_NEON -#if __aarch64__ -// PackMatrixB_8c used only for aarch32 -#else - asm volatile( - 
// "pld [%[b0]] \n\t" - "vld1.s8 {d0}, [%[b0]] \n\t" - "vst1.s8 {d0}, [%[local_buffer]]! \n\t" - : [local_buffer] "+r"(local_buffer) - : [b0] "r"(b0) - : "memory", "q0"); -#endif // __aarch64__ -#else - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; -#endif // __ARM_NEON - } - } - if (n_tail != 0) { - int8_t *local_buffer = buffer + j_length * k; - for (int32_t i = 0; i < k; ++i) { - const int8_t *b0 = &B(i, j_length); - for (int32_t j = j_length; j < n; ++j) { - *local_buffer++ = *b0++; - } - for (int32_t j = n; j < j_length + 8; ++j) { - *local_buffer++ = 0; - } - } - } -} - -// 8 bits int PackMatrixA_4r -void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, - const int8_t *A, int32_t lda, int8_t *buffer) { - const int32_t i_length = m - m_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; - - for (int32_t i = 0; i < i_length; i += 4) { - const int8_t *a0 = A + i * lda; - const int8_t *a1 = A + (i + 1) * lda; - const int8_t *a2 = A + (i + 2) * lda; - const int8_t *a3 = A + (i + 3) * lda; - int8_t *local_buffer = buffer + i * KC; - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[a0]], #16 \n\t" - "ld1 {v1.16b}, [%[a1]], #16 \n\t" - "ld1 {v2.16b}, [%[a2]], #16 \n\t" - "ld1 {v3.16b}, [%[a3]], #16 \n\t" - "st1 {v0.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v1.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v2.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v3.16b}, [%[local_buffer]], #16 \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "v0", "v1", "v2", "v3"); -#else - asm volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! 
\n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } - - if (m_tail != 0) { - const int8_t *a0 = &A(i_length, 0); - const int8_t *a1 = a0 + lda; - const int8_t *a2 = a0 + 2 * lda; - const int8_t *a3 = a0 + 3 * lda; - int8_t *local_buffer = buffer + i_length * KC; - switch (m_tail) { - case 1: - a1 = zero_int8; - case 2: - a2 = zero_int8; - case 3: - a3 = zero_int8; - break; - default: - break; - } - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[a0]], #16 \n\t" - "ld1 {v1.16b}, [%[a1]], #16 \n\t" - "ld1 {v2.16b}, [%[a2]], #16 \n\t" - "ld1 {v3.16b}, [%[a3]], #16 \n\t" - "st1 {v0.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v1.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v2.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v3.16b}, 
[%[local_buffer]], #16 \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "v0", "v1", "v2", "v3"); -#else - asm volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! \n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - -// 8 bits int PackMatrixB -void Gemm::PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; - for (int32_t j = 0; j < j_length; j += 2) { - int8_t *local_buffer = buffer + j * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = 
&B((i << 4), j); - const int8_t *b1 = &B((i << 4), j + 1); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b1; - b1 += ldb; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j); - const int8_t *b1 = &B((k_count << 4), j + 1); - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b1; - b1 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } - if (n_tail != 0) { - int8_t *local_buffer = buffer + j_length * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = &B((i << 4), j_length); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = 0; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j_length); - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - -void Gemm::PackMatrixB_4c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; - for (int32_t j = 0; j < n; j += 4) { - int8_t *local_buffer = buffer + j * KC; - const int8_t *b0 = &B(0, j); - const int8_t *b1 = b0 + 1; - const int8_t *b2 = b0 + 2; - const int8_t *b3 = b0 + 3; - if (j > j_length) { - switch (n_tail) { - case 1: - b1 = zero_int8; - case 2: - b2 = zero_int8; - case 3: - b3 = zero_int8; - break; - default: - break; - } - } - - for (int32_t i = 0; i < k_count; ++i) { - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - 
*local_buffer++ = *b1; - b1 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b2; - b2 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b3; - b3 += ldb; - } - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b1; - b1 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b2; - b2 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b3; - b3 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - -// 8 bits int write back -// C = A * B -void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, - int32_t ldc) { -#if __ARM_NEON -#if __aarch64__ - int32_t nc1 = nc / 4; - int32_t _nc1 = nc % 4; - - int32_t *c_ptr, *C_ptr; - int32x4_t cv; - for (int32_t i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - for (int32_t j = 0; j < nc1; ++j) { - cv = vld1q_s32(c_ptr); - vst1q_s32(C_ptr, cv); - c_ptr += 4; - C_ptr += 4; - } - if (_nc1 != 0) { - cv = vld1q_s32(c_ptr); - if (_nc1 >= 1) { - vst1q_lane_s32(C_ptr, cv, 0); - C_ptr++; - } - if (_nc1 >= 2) { - vst1q_lane_s32(C_ptr, cv, 1); - C_ptr++; - } - if (_nc1 >= 3) { - vst1q_lane_s32(C_ptr, cv, 2); - } - } - } -#else - int32_t nc1 = nc >> 4; - int32_t _nc1 = nc & 15; - int32_t step = sizeof(int32_t) * ldc; - int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4)); - int32_t volatile m = mc; - int32_t volatile n = nc1; - int32_t *volatile c_ptr, *volatile C_ptr; - int32_t *C0, *c0; - c_ptr = c; - C_ptr = C; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "loop_mc_%=: \n\t" - - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - 
"subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vst1.32 {q0, q1}, [r6]! \n\t" - - "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" - "vst1.32 {q2, q3}, [r6]! \n\t" - - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), - [step] "r"(step), [step1] "r"(step1) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); - } - - if (_nc1 != 0) { - for (int32_t i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 16 + i * ldc; - c0 = c_ptr + nc1 * 16 + i * NC; - for (int32_t j = 0; j < _nc1; j++) { - *C0++ = *c0++; - } - } - } -#endif // __aarch64__ -#endif // __ARM_NEON -} - -// C = A * B + bias, scale * C, bias is added on column -void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale) { -#if __ARM_NEON -#if __aarch64__ - int32_t nc1 = nc >> 3; - int32_t _nc1 = nc & 7; - - int32_t *c_ptr; - int8_t *C_ptr; - int32x4_t cv0; - int32x4_t cv1; - int16x8_t cv_h; - int8x8_t cv_b; - int32x4_t biasv; - int8_t min = -127; - int8x8_t minv = vdup_n_s8(min); - for (int32_t i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - biasv = vld1q_dup_s32(bias + i); - for (int32_t j = 0; j < nc1; ++j) { - cv0 = vld1q_s32(c_ptr); - cv1 = vld1q_s32(c_ptr + 4); - cv0 = vqaddq_s32(cv0, biasv); - cv1 = vqaddq_s32(cv1, biasv); - - cv_h = vcombine_s16(vqmovn_s32(cv0), vqmovn_s32(cv1)); - cv_b = vqmovn_s16(cv_h); - - cv_b = vmax_s8(cv_b, minv); - vst1_s8(C_ptr, cv_b); - c_ptr += 8; - C_ptr += 8; - } - if (_nc1 != 0) { - cv0 = vld1q_s32(c_ptr); - cv1 = vld1q_s32(c_ptr + 4); - cv0 = vqaddq_s32(cv0, biasv); - cv1 = vqaddq_s32(cv1, biasv); - - cv_h = vcombine_s16(vqmovn_s32(cv0), vqmovn_s32(cv1)); - cv_b = vqmovn_s16(cv_h); - - cv_b = 
vmax_s8(cv_b, minv); - - switch (_nc1) { - case 7: - vst1_lane_s8(C_ptr + 6, cv_b, 6); - case 6: - vst1_lane_s8(C_ptr + 5, cv_b, 5); - case 5: - vst1_lane_s8(C_ptr + 4, cv_b, 4); - case 4: - vst1_lane_s8(C_ptr + 3, cv_b, 3); - case 3: - vst1_lane_s8(C_ptr + 2, cv_b, 2); - case 2: - vst1_lane_s8(C_ptr + 1, cv_b, 1); - case 1: - vst1_lane_s8(C_ptr, cv_b, 0); - default: - break; - } - } - } -#else - int8_t narrow = -128; - int32_t nc1 = nc >> 3; - int32_t _nc1 = nc & 7; - int32_t step = sizeof(int8_t) * ldc; - int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3)); - int32_t volatile m = mc; - int32_t volatile n = nc1; - int32_t *volatile c_ptr, *volatile bias_ptr; - int8_t *volatile C_ptr; - c_ptr = c; - C_ptr = C; - bias_ptr = bias; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "vdup.32 q15, %[scale] \n\t" - "vdup.8 d24, %[narrow] \n\t" - "loop_mc_%=: \n\t" - "vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t" - "vdup.32 q13, d26[0] \n\t" - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vqadd.s32 q1, q1, q13 \n\t" - "vcvt.f32.s32 q2, q0 \n\t" - "vcvt.f32.s32 q3, q1 \n\t" - "vmul.f32 q2, q2, q15 \n\t" - "vmul.f32 q3, q3, q15 \n\t" - "vcvt.s32.f32 q4, q2 \n\t" - "vcvt.s32.f32 q5, q3 \n\t" - "vqmovn.s32 d12, q4 \n\t" - "vqmovn.s32 d13, q5 \n\t" - "vqmovn.s16 d14, q6 \n\t" - "vceq.s8 d15, d14, d24 \n\t" - "vsub.s8 d14, d14, d15 \n\t" - "vst1.8 {d14}, [r6]! 
\n\t" - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), - [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr), - [scale] "r"(scale), [narrow] "r"(narrow) - : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q12", "q13", "q15"); - } - - int32_t nc_left; - int32_t *c0; - int8_t *C0; - int32_t bias_v; - if (_nc1 != 0) { - for (int32_t i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 8 + i * ldc; - c0 = c_ptr + nc1 * 8 + i * NC; - bias_v = *(bias_ptr + i); - nc_left = _nc1; - asm volatile( - "vdup.32 q15, %[scale] \n\t" - "vdup.8 d24, %[narrow] \n\t" - "vdup.32 q13, %[bias_v] \n\t" - "cmp %[_nc1], #4 \n\t" - "blt less_four_%= \n\t" - "vld1.32 {q0}, [%[c0]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vceq.s8 d9, d8, d24 \n\t" - "vsub.s8 d8, d8, d9 \n\t" - "vst1.8 {d8[0]}, [%[C0]]! \n\t" - "vst1.8 {d8[1]}, [%[C0]]! \n\t" - "vst1.8 {d8[2]}, [%[C0]]! \n\t" - "vst1.8 {d8[3]}, [%[C0]]! \n\t" - "subs %[_nc1], %[_nc1], #4 \n\t" - "beq process_over_%= \n\t" - "less_four_%=: \n\t" - "vld1.32 {q0}, [%[c0]] \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vceq.s8 d9, d8, d24 \n\t" - "vsub.s8 d8, d8, d9 \n\t" - "loop_save_%=: \n\t" - "vst1.8 {d8[0]}, [%[C0]]! 
\n\t" - "vext.8 d8, d8, d8, #1 \n\t" - "subs %[_nc1], %[_nc1], #1 \n\t" - "bgt loop_save_%= \n\t" - "process_over_%=: \n\t" - : - : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), - [bias_v] "r"(bias_v), [scale] "r"(scale), [narrow] "r"(narrow) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15"); - } - } -#endif // __aarch64__ -#endif // __ARM_NEON -} - -// C = A * B + bias, scale * C, bias is added on row -void Gemm::WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale) { -#if __ARM_NEON -#if __aarch64__ - int32_t nc1 = nc >> 3; - int32_t _nc1 = nc & 7; - - int32_t *c_ptr; - int8_t *C_ptr; - int32x4_t cv0; - int32x4_t cv1; - int16x8_t cv_h; - int8x8_t cv_b; - int32_t *bias_ptr; - int32x4_t biasv0; - int32x4_t biasv1; - int8_t min = -127; - int8x8_t minv = vdup_n_s8(min); - for (int32_t i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - bias_ptr = bias; - for (int32_t j = 0; j < nc1; ++j) { - cv0 = vld1q_s32(c_ptr); - cv1 = vld1q_s32(c_ptr + 4); - biasv0 = vld1q_s32(bias_ptr); - biasv1 = vld1q_s32(bias_ptr + 4); - cv0 = vqaddq_s32(cv0, biasv0); - cv1 = vqaddq_s32(cv1, biasv1); - - cv_h = vcombine_s16(vqmovn_s32(cv0), vqmovn_s32(cv1)); - cv_b = vqmovn_s16(cv_h); - - cv_b = vmax_s8(cv_b, minv); - vst1_s8(C_ptr, cv_b); - c_ptr += 8; - C_ptr += 8; - bias_ptr += 8; - } - if (_nc1 != 0) { - cv0 = vld1q_s32(c_ptr); - cv1 = vld1q_s32(c_ptr + 4); - biasv0 = vld1q_s32(bias_ptr); - biasv1 = vld1q_s32(bias_ptr + 4); - cv0 = vqaddq_s32(cv0, biasv0); - cv1 = vqaddq_s32(cv1, biasv1); - - cv_h = vcombine_s16(vqmovn_s32(cv0), vqmovn_s32(cv1)); - cv_b = vqmovn_s16(cv_h); - - cv_b = vmax_s8(cv_b, minv); - - switch (_nc1) { - case 7: - vst1_lane_s8(C_ptr + 6, cv_b, 6); - case 6: - vst1_lane_s8(C_ptr + 5, cv_b, 5); - case 5: - vst1_lane_s8(C_ptr + 4, cv_b, 4); - case 4: - vst1_lane_s8(C_ptr + 3, cv_b, 3); - case 3: - vst1_lane_s8(C_ptr + 2, cv_b, 2); - case 2: - vst1_lane_s8(C_ptr + 1, cv_b, 
1); - case 1: - vst1_lane_s8(C_ptr, cv_b, 0); - default: - break; - } - } - } -#else - int8_t narrow = -128; - int32_t nc1 = nc >> 3; - int32_t _nc1 = nc & 7; - int32_t step = sizeof(int8_t) * ldc; - int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3)); - int32_t volatile m = mc; - int32_t volatile n = nc1; - int32_t *volatile c_ptr, *volatile bias_ptr; - int8_t *volatile C_ptr; - c_ptr = c; - C_ptr = C; - bias_ptr = bias; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "vdup.32 q15, %[scale] \n\t" - "vdup.8 d24, %[narrow] \n\t" - "loop_mc_%=: \n\t" - "mov r4, %[bias_ptr] \n\t" - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - "vld1.32 {q13, q14}, [r4]! \n\t" - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vqadd.s32 q1, q1, q14 \n\t" - "vcvt.f32.s32 q2, q0 \n\t" - "vcvt.f32.s32 q3, q1 \n\t" - "vmul.f32 q2, q2, q15 \n\t" - "vmul.f32 q3, q3, q15 \n\t" - "vcvt.s32.f32 q4, q2 \n\t" - "vcvt.s32.f32 q5, q3 \n\t" - "vqmovn.s32 d12, q4 \n\t" - "vqmovn.s32 d13, q5 \n\t" - "vqmovn.s16 d14, q6 \n\t" - "vceq.s8 d15, d14, d24 \n\t" - "vsub.s8 d14, d14, d15 \n\t" - "vst1.8 {d14}, [r6]! 
\n\t" - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), - [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr), - [scale] "r"(scale), [narrow] "r"(narrow) - : "cc", "memory", "r4", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", - "q6", "q7", "q12", "q13", "q15"); - } - - int32_t nc_left; - int32_t *c0; - int8_t *C0; - int32_t *volatile bias0 = bias_ptr + nc1 * 8; - if (_nc1 != 0) { - for (int32_t i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 8 + i * ldc; - c0 = c_ptr + nc1 * 8 + i * NC; - nc_left = _nc1; - asm volatile( - "vdup.32 q15, %[scale] \n\t" - "vdup.8 d24, %[narrow] \n\t" - "cmp %[_nc1], #4 \n\t" - "blt less_four_%= \n\t" - "vld1.32 {q0}, [%[c0]]! \n\t" - "vld1.32 {q13}, [%[bias0]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vceq.s8 d9, d8, d24 \n\t" - "vsub.s8 d8, d8, d9 \n\t" - "vst1.8 {d8[0]}, [%[C0]]! \n\t" - "vst1.8 {d8[1]}, [%[C0]]! \n\t" - "vst1.8 {d8[2]}, [%[C0]]! \n\t" - "vst1.8 {d8[3]}, [%[C0]]! \n\t" - "subs %[_nc1], %[_nc1], #4 \n\t" - "beq process_over_%= \n\t" - "less_four_%=: \n\t" - "vld1.32 {q0}, [%[c0]] \n\t" - "vld1.32 {q13}, [%[bias0]] \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vceq.s8 d9, d8, d24 \n\t" - "vsub.s8 d8, d8, d9 \n\t" - "loop_save_%=: \n\t" - "vst1.8 {d8[0]}, [%[C0]]! 
\n\t" - "vext.8 d8, d8, d8, #1 \n\t" - "subs %[_nc1], %[_nc1], #1 \n\t" - "bgt loop_save_%= \n\t" - "process_over_%=: \n\t" - : - : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), [bias0] "r"(bias0), - [scale] "r"(scale), [narrow] "r"(narrow) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15"); - } - } -#endif // __aarch64__ -#endif // __ARM_NEON -} - -// C = A * B + bias, scale * relu(C), bias is added on column -void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale) { -#if __ARM_NEON -#if __aarch64__ - int32_t nc1 = nc >> 3; - int32_t _nc1 = nc & 7; - - int32_t *c_ptr; - int8_t *C_ptr; - int32x4_t cv0; - int32x4_t cv1; - int16x8_t cv_h; - int8x8_t cv_b; - int32x4_t biasv; - int32x4_t zero = vdupq_n_s32(0); - for (int32_t i = 0; i < mc; ++i) { - c_ptr = c + i * NC; - C_ptr = C + i * ldc; - biasv = vld1q_dup_s32(bias + i); - for (int32_t j = 0; j < nc1; ++j) { - cv0 = vld1q_s32(c_ptr); - cv1 = vld1q_s32(c_ptr + 4); - cv0 = vqaddq_s32(cv0, biasv); - cv1 = vqaddq_s32(cv1, biasv); - cv0 = vmaxq_s32(cv0, zero); - cv1 = vmaxq_s32(cv1, zero); - - cv_h = vcombine_s16(vqmovn_s32(cv0), vqmovn_s32(cv1)); - cv_b = vqmovn_s16(cv_h); - - vst1_s8(C_ptr, cv_b); - c_ptr += 8; - C_ptr += 8; - } - if (_nc1 != 0) { - cv0 = vld1q_s32(c_ptr); - cv1 = vld1q_s32(c_ptr + 4); - cv0 = vqaddq_s32(cv0, biasv); - cv1 = vqaddq_s32(cv1, biasv); - cv0 = vmaxq_s32(cv0, zero); - cv1 = vmaxq_s32(cv1, zero); - - cv_h = vcombine_s16(vqmovn_s32(cv0), vqmovn_s32(cv1)); - cv_b = vqmovn_s16(cv_h); - - switch (_nc1) { - case 7: - vst1_lane_s8(C_ptr + 6, cv_b, 6); - case 6: - vst1_lane_s8(C_ptr + 5, cv_b, 5); - case 5: - vst1_lane_s8(C_ptr + 4, cv_b, 4); - case 4: - vst1_lane_s8(C_ptr + 3, cv_b, 3); - case 3: - vst1_lane_s8(C_ptr + 2, cv_b, 2); - case 2: - vst1_lane_s8(C_ptr + 1, cv_b, 1); - case 1: - vst1_lane_s8(C_ptr, cv_b, 0); - default: - break; - } - } - } -#else - int32_t zero = 0; - int32_t nc1 = nc >> 3; - 
int32_t _nc1 = nc & 7; - int32_t step = sizeof(int8_t) * ldc; - int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3)); - int32_t volatile m = mc; - int32_t volatile n = nc1; - int32_t *volatile c_ptr, *volatile bias_ptr; - int8_t *volatile C_ptr; - c_ptr = c; - C_ptr = C; - bias_ptr = bias; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" - "loop_mc_%=: \n\t" - "vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t" - "vdup.32 q13, d26[0] \n\t" - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vqadd.s32 q1, q1, q13 \n\t" - "vmax.s32 q0, q0, q14 \n\t" - "vmax.s32 q1, q1, q14 \n\t" - "vcvt.f32.s32 q2, q0 \n\t" - "vcvt.f32.s32 q3, q1 \n\t" - "vmul.f32 q2, q2, q15 \n\t" - "vmul.f32 q3, q3, q15 \n\t" - "vcvt.s32.f32 q4, q2 \n\t" - "vcvt.s32.f32 q5, q3 \n\t" - "vqmovn.s32 d12, q4 \n\t" - "vqmovn.s32 d13, q5 \n\t" - "vqmovn.s16 d14, q6 \n\t" - "vst1.8 {d14}, [r6]! 
\n\t" - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), - [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr), - [scale] "r"(scale), [zero] "r"(zero) - : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q13", "q14", "q15"); - } - - int32_t nc_left; - int32_t *c0; - int8_t *C0; - int32_t bias_v; - if (_nc1 != 0) { - for (int32_t i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 8 + i * ldc; - c0 = c_ptr + nc1 * 8 + i * NC; - bias_v = *(bias_ptr + i); - nc_left = _nc1; - asm volatile( - "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" - "vdup.32 q13, %[bias_v] \n\t" - "cmp %[_nc1], #4 \n\t" - "blt less_four_%= \n\t" - "vld1.32 {q0}, [%[c0]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vmax.s32 q0, q0, q14 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vst1.8 {d8[0]}, [%[C0]]! \n\t" - "vst1.8 {d8[1]}, [%[C0]]! \n\t" - "vst1.8 {d8[2]}, [%[C0]]! \n\t" - "vst1.8 {d8[3]}, [%[C0]]! \n\t" - "subs %[_nc1], %[_nc1], #4 \n\t" - "beq process_over_%= \n\t" - "less_four_%=: \n\t" - "vld1.32 {q0}, [%[c0]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vmax.s32 q0, q0, q14 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "loop_save_%=: \n\t" - "vst1.8 {d8[0]}, [%[C0]]! 
\n\t" - "vext.8 d8, d8, d8, #1 \n\t" - "subs %[_nc1], %[_nc1], #1 \n\t" - "bgt loop_save_%= \n\t" - "process_over_%=: \n\t" - : - : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), - [bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q13", "q14", "q15"); - } - } -#endif // __aarch64__ -#endif // __ARM_NEON -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gemm_omp_int8.cpp b/mobile/src/operators/math/gemm_omp_int8.cpp deleted file mode 100644 index 2ea4520181..0000000000 --- a/mobile/src/operators/math/gemm_omp_int8.cpp +++ /dev/null @@ -1,453 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "common/log.h" -#include "memory/t_malloc.h" -#include "operators/math/gemm.h" -#if __ARM_NEON -#include -#endif -#ifdef _OPENMP -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; -#pragma omp parallel for - for (int32_t j = 0; j < j_length; j += 8) { - int8_t *local_buffer = buffer + j * k; - for (int32_t i = 0; i < k; ++i) { - const int8_t *b0 = &B(i, j); -#if __ARM_NEON -#if __aarch64__ -// PackMatrixB_omp_8c used only for aarch32 -#else - asm volatile( - // "pld [%[b0]] \n\t" - "vld1.s8 {d0}, [%[b0]] \n\t" - "vst1.s8 {d0}, [%[local_buffer]]! \n\t" - : [local_buffer] "+r"(local_buffer) - : [b0] "r"(b0) - : "memory", "q0"); -#endif // __aarch64__ -#else - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; - *local_buffer++ = *b0++; -#endif // __ARM_NEON - } - } - if (n_tail != 0) { - int8_t *local_buffer = buffer + j_length * k; - for (int32_t i = 0; i < k; ++i) { - const int8_t *b0 = &B(i, j_length); - for (int32_t j = j_length; j < n; ++j) { - *local_buffer++ = *b0++; - } - for (int32_t j = n; j < j_length + 8; ++j) { - *local_buffer++ = 0; - } - } - } -} - -void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, - const int8_t *A, int32_t lda, int8_t *buffer) { - const int32_t i_length = m - m_tail; -#pragma omp parallel for - for (int32_t i = 0; i < i_length; i += 4) { - const int8_t *a0 = A + i * lda; - const int8_t *a1 = A + (i + 1) * lda; - const int8_t *a2 = A + (i + 2) * lda; - const int8_t *a3 = A + (i + 3) * lda; - int8_t *local_buffer = buffer + i * k; - for (int32_t j = 0; j < k; ++j) { - *local_buffer++ = *a0++; - *local_buffer++ = *a1++; - *local_buffer++ = *a2++; - *local_buffer++ = 
*a3++; - } - } - - if (m_tail != 0) { - const int8_t *a0 = &A(i_length, 0); - const int8_t *a1 = a0 + lda; - const int8_t *a2 = a0 + 2 * lda; - const int8_t *a3 = a0 + 3 * lda; - int8_t *local_buffer = buffer + i_length * k; - switch (m_tail) { - case 1: - a1 = zero_int8; - case 2: - a2 = zero_int8; - case 3: - a3 = zero_int8; - break; - default: - break; - } - for (int32_t j = 0; j < k; ++j) { - *local_buffer++ = *a0++; - *local_buffer++ = *a1++; - *local_buffer++ = *a2++; - *local_buffer++ = *a3++; - } - } -} - -// 8 bits int PackMatrixA_4r -void Gemm::PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail, - const int8_t *A, int32_t lda, int8_t *buffer) { - const int32_t i_length = m - m_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; -#pragma omp parallel for - for (int32_t i = 0; i < i_length; i += 4) { - const int8_t *a0 = A + i * lda; - const int8_t *a1 = A + (i + 1) * lda; - const int8_t *a2 = A + (i + 2) * lda; - const int8_t *a3 = A + (i + 3) * lda; - int8_t *local_buffer = buffer + i * KC; - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[a0]], #16 \n\t" - "ld1 {v1.16b}, [%[a1]], #16 \n\t" - "ld1 {v2.16b}, [%[a2]], #16 \n\t" - "ld1 {v3.16b}, [%[a3]], #16 \n\t" - "st1 {v0.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v1.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v2.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v3.16b}, [%[local_buffer]], #16 \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "v0", "v1", "v2", "v3"); -#else - asm volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! \n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! 
\n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } - - if (m_tail != 0) { - const int8_t *a0 = &A(i_length, 0); - const int8_t *a1 = a0 + lda; - const int8_t *a2 = a0 + 2 * lda; - const int8_t *a3 = a0 + 3 * lda; - int8_t *local_buffer = buffer + i_length * KC; - switch (m_tail) { - case 1: - a1 = zero_int8; - case 2: - a2 = zero_int8; - case 3: - a3 = zero_int8; - break; - default: - break; - } - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[a0]], #16 \n\t" - "ld1 {v1.16b}, [%[a1]], #16 \n\t" - "ld1 {v2.16b}, [%[a2]], #16 \n\t" - "ld1 {v3.16b}, [%[a3]], #16 \n\t" - "st1 {v0.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v1.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v2.16b}, [%[local_buffer]], #16 \n\t" - "st1 {v3.16b}, [%[local_buffer]], #16 \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "v0", "v1", "v2", "v3"); -#else - asm 
volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! \n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - -// 8 bits int PackMatrixB -void Gemm::PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; -#pragma omp parallel for - for (int32_t j = 0; j < j_length; j += 2) { - int8_t *local_buffer = buffer + j * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = &B((i << 4), j); - const int8_t *b1 = &B((i << 4), j + 1); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m 
< 16; ++m) { - *local_buffer++ = *b1; - b1 += ldb; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j); - const int8_t *b1 = &B((k_count << 4), j + 1); - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b1; - b1 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } - if (n_tail != 0) { - int8_t *local_buffer = buffer + j_length * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = &B((i << 4), j_length); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = 0; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j_length); - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - -// 8 bits int PackMatrixB -void Gemm::PackMatrixB_omp_4c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; -#pragma omp parallel for - for (int32_t j = 0; j < n; j += 4) { - int8_t *local_buffer = buffer + j * KC; - const int8_t *b0 = &B(0, j); - const int8_t *b1 = b0 + 1; - const int8_t *b2 = b0 + 2; - const int8_t *b3 = b0 + 3; - if (j > j_length) { - switch (n_tail) { - case 1: - b1 = zero_int8; - case 2: - b2 = zero_int8; - case 3: - b3 = zero_int8; - break; - default: - break; - } - } - - for (int32_t i = 0; i < k_count; ++i) { - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b1; - b1 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b2; - b2 += 
ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b3; - b3 += ldb; - } - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b1; - b1 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b2; - b2 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b3; - b3 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/gpc.cpp b/mobile/src/operators/math/gpc.cpp deleted file mode 100644 index 6b7700081a..0000000000 --- a/mobile/src/operators/math/gpc.cpp +++ /dev/null @@ -1,2142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MULTICLASSNMS_OP - -#include "operators/math/gpc.h" - -namespace gpc { - -typedef struct lmt_shape { /* Local minima table */ - double y; /* Y coordinate at local minimum */ - edge_node *first_bound; /* Pointer to bound list */ - struct lmt_shape *next; /* Pointer to next local minimum */ -} lmt_node; - -typedef struct sbt_t_shape { /* Scanbeam tree */ - double y; /* Scanbeam node y value */ - struct sbt_t_shape *less; /* Pointer to nodes with lower y */ - struct sbt_t_shape *more; /* Pointer to nodes with higher y */ -} sb_tree; - -typedef struct it_shape { /* Intersection table */ - edge_node *ie[2]; /* Intersecting edge (bundle) pair */ - gpc_vertex point; /* Point of intersection */ - struct it_shape *next; /* The next intersection table node */ -} it_node; - -typedef struct st_shape { /* Sorted edge table */ - edge_node *edge; /* Pointer to AET edge */ - double xb; /* Scanbeam bottom x coordinate */ - double xt; /* Scanbeam top x coordinate */ - double dx; /* Change in x for a unit y increase */ - struct st_shape *prev; /* Previous edge in sorted list */ -} st_node; - -typedef struct bbox_shape { /* Contour axis-aligned bounding box */ - double xmin; /* Minimum x coordinate */ - double ymin; /* Minimum y coordinate */ - double xmax; /* Maximum x coordinate */ - double ymax; /* Maximum y coordinate */ -} bbox; - -/* -=========================================================================== - Global Data -=========================================================================== -*/ - -/* Horizontal edge state transitions within scanbeam boundary */ -const h_state next_h_state[3][6] = { - /* ABOVE BELOW CROSS */ - /* L R L R L R */ - /* NH */ - {BH, TH, TH, BH, NH, NH}, - /* BH */ - {NH, NH, NH, NH, TH, TH}, - /* TH */ - {NH, NH, NH, NH, BH, BH}}; - -/* -=========================================================================== - Private Functions -=========================================================================== -*/ - -static void 
reset_it(it_node **it) { - it_node *itn; - - while (*it) { - itn = (*it)->next; - gpc_free(*it); - *it = itn; - } -} - -static void reset_lmt(lmt_node **lmt) { - lmt_node *lmtn; - - while (*lmt) { - lmtn = (*lmt)->next; - gpc_free(*lmt); - *lmt = lmtn; - } -} - -static void insert_bound(edge_node **b, edge_node *e) { - edge_node *existing_bound = NULL; - - if (!*b) { - /* Link node e to the tail of the list */ - *b = e; - } else { - /* Do primary sort on the x field */ - if (e[0].bot.x < (*b)[0].bot.x) { - /* Insert a new node mid-list */ - existing_bound = *b; - *b = e; - (*b)->next_bound = existing_bound; - } else { - if (e[0].bot.x == (*b)[0].bot.x) { - /* Do secondary sort on the dx field */ - if (e[0].dx < (*b)[0].dx) { - /* Insert a new node mid-list */ - existing_bound = *b; - *b = e; - (*b)->next_bound = existing_bound; - } else { - /* Head further down the list */ - insert_bound(&((*b)->next_bound), e); - } - } else { - /* Head further down the list */ - insert_bound(&((*b)->next_bound), e); - } - } - } -} - -static edge_node **bound_list(lmt_node **lmt, double y) { - lmt_node *existing_node; - - if (!*lmt) { - /* Add node onto the tail end of the LMT */ - gpc_malloc(*lmt, sizeof(lmt_node), - const_cast("LMT insertion")); - (*lmt)->y = y; - (*lmt)->first_bound = NULL; - (*lmt)->next = NULL; - return &((*lmt)->first_bound); - } else if (y < (*lmt)->y) { - /* Insert a new LMT node before the current node */ - existing_node = *lmt; - gpc_malloc(*lmt, sizeof(lmt_node), - const_cast("LMT insertion")); - (*lmt)->y = y; - (*lmt)->first_bound = NULL; - (*lmt)->next = existing_node; - return &((*lmt)->first_bound); - } else { - if (y > (*lmt)->y) { - /* Head further up the LMT */ - return bound_list(&((*lmt)->next), y); - } else { - /* Use this existing LMT node */ - return &((*lmt)->first_bound); - } - } -} - -static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) { - if (!*sbtree) { - /* Add a new tree node here */ - gpc_malloc(*sbtree, 
sizeof(sb_tree), - const_cast("scanbeam tree insertion")); - (*sbtree)->y = y; - (*sbtree)->less = NULL; - (*sbtree)->more = NULL; - (*entries)++; - } else { - if ((*sbtree)->y > y) { - /* Head into the 'less' sub-tree */ - add_to_sbtree(entries, &((*sbtree)->less), y); - } else { - if ((*sbtree)->y < y) { - /* Head into the 'more' sub-tree */ - add_to_sbtree(entries, &((*sbtree)->more), y); - } - } - } -} - -static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) { - if (sbtree->less) { - build_sbt(entries, sbt, sbtree->less); - } - sbt[*entries] = sbtree->y; - (*entries)++; - if (sbtree->more) { - build_sbt(entries, sbt, sbtree->more); - } -} - -static void free_sbtree(sb_tree **sbtree) { - if (*sbtree) { - free_sbtree(&((*sbtree)->less)); - free_sbtree(&((*sbtree)->more)); - gpc_free(*sbtree); - } -} - -static int count_optimal_vertices(gpc_vertex_list c) { - int result = 0; - int i = 0; - - /* Ignore non-contributing contours */ - if (c.num_vertices > 0) { - for (i = 0; i < c.num_vertices; i++) { - /* Ignore superfluous vertices embedded in horizontal edges */ - if (gpc_optimal(c.vertex, i, c.num_vertices)) { - result++; - } - } - } - return result; -} - -static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries, - gpc_polygon *p, int type, gpc_op op) { - int c = 0; - int i = 0; - int min = 0; - int max = 0; - int num_edges = 0; - int v = 0; - int num_vertices = 0; - int total_vertices = 0; - int e_index = 0; - edge_node *e = NULL; - edge_node *edge_table = NULL; - - for (c = 0; c < p->num_contours; c++) { - total_vertices += count_optimal_vertices(p->contour[c]); - } - - /* Create the entire input polygon edge table in one go */ - gpc_malloc(edge_table, total_vertices * sizeof(edge_node), - const_cast("edge table creation")); - - for (c = 0; c < p->num_contours; c++) { - if (p->contour[c].num_vertices < 0) { - /* Ignore the non-contributing contour and repair the vertex count */ - p->contour[c].num_vertices = 
-p->contour[c].num_vertices; - } else { - /* Perform contour optimisation */ - num_vertices = 0; - for (i = 0; i < p->contour[c].num_vertices; i++) { - if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) { - edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x; - edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y; - - /* Record vertex in the scanbeam table */ - add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y); - - num_vertices++; - } - } - - /* Do the contour forward pass */ - for (min = 0; min < num_vertices; min++) { - /* If a forward local minimum... */ - if (gpc_fwd_min(edge_table, min, num_vertices)) { - /* Search for the next local maximum... */ - num_edges = 1; - max = gpc_next_index(min, num_vertices); - while (gpc_not_fmax(edge_table, max, num_vertices)) { - num_edges++; - max = gpc_next_index(max, num_vertices); - } - - /* Build the next edge list */ - e = &edge_table[e_index]; - e_index += num_edges; - v = min; - e[0].bstate[BELOW] = UNBUNDLED; - e[0].bundle[BELOW][CLIP] = 0; - e[0].bundle[BELOW][SUBJ] = 0; - for (i = 0; i < num_edges; i++) { - e[i].xb = edge_table[v].vertex.x; - e[i].bot.x = edge_table[v].vertex.x; - e[i].bot.y = edge_table[v].vertex.y; - - v = gpc_next_index(v, num_vertices); - - e[i].top.x = edge_table[v].vertex.x; - e[i].top.y = edge_table[v].vertex.y; - e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / - (e[i].top.y - e[i].bot.y); - e[i].type = type; - e[i].outp[ABOVE] = NULL; - e[i].outp[BELOW] = NULL; - e[i].next = NULL; - e[i].prev = NULL; - e[i].succ = - ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; - e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; - e[i].next_bound = NULL; - e[i].bside[CLIP] = (op == GPC_DIFF) ? 
RIGHT : LEFT; - e[i].bside[SUBJ] = LEFT; - } - insert_bound(bound_list(lmt, edge_table[min].vertex.y), e); - } - } - - /* Do the contour reverse pass */ - for (min = 0; min < num_vertices; min++) { - /* If a reverse local minimum... */ - if (gpc_rev_min(edge_table, min, num_vertices)) { - /* Search for the previous local maximum... */ - num_edges = 1; - max = gpc_prev_index(min, num_vertices); - while (gpc_not_rmax(edge_table, max, num_vertices)) { - num_edges++; - max = gpc_prev_index(max, num_vertices); - } - - /* Build the previous edge list */ - e = &edge_table[e_index]; - e_index += num_edges; - v = min; - e[0].bstate[BELOW] = UNBUNDLED; - e[0].bundle[BELOW][CLIP] = 0; - e[0].bundle[BELOW][SUBJ] = 0; - for (i = 0; i < num_edges; i++) { - e[i].xb = edge_table[v].vertex.x; - e[i].bot.x = edge_table[v].vertex.x; - e[i].bot.y = edge_table[v].vertex.y; - - v = gpc_prev_index(v, num_vertices); - - e[i].top.x = edge_table[v].vertex.x; - e[i].top.y = edge_table[v].vertex.y; - e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / - (e[i].top.y - e[i].bot.y); - e[i].type = type; - e[i].outp[ABOVE] = NULL; - e[i].outp[BELOW] = NULL; - e[i].next = NULL; - e[i].prev = NULL; - e[i].succ = - ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; - e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; - e[i].next_bound = NULL; - e[i].bside[CLIP] = (op == GPC_DIFF) ? 
RIGHT : LEFT; - e[i].bside[SUBJ] = LEFT; - } - insert_bound(bound_list(lmt, edge_table[min].vertex.y), e); - } - } - } - } - return edge_table; -} // NOLINT - -static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) { - if (!*aet) { - /* Append edge onto the tail end of the AET */ - *aet = edge; - edge->prev = prev; - edge->next = NULL; - } else { - /* Do primary sort on the xb field */ - if (edge->xb < (*aet)->xb) { - /* Insert edge here (before the AET edge) */ - edge->prev = prev; - edge->next = *aet; - (*aet)->prev = edge; - *aet = edge; - } else { - if (edge->xb == (*aet)->xb) { - /* Do secondary sort on the dx field */ - if (edge->dx < (*aet)->dx) { - /* Insert edge here (before the AET edge) */ - edge->prev = prev; - edge->next = *aet; - (*aet)->prev = edge; - *aet = edge; - } else { - /* Head further into the AET */ - add_edge_to_aet(&((*aet)->next), edge, *aet); - } - } else { - /* Head further into the AET */ - add_edge_to_aet(&((*aet)->next), edge, *aet); - } - } - } -} - -static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1, - double x, double y) { - it_node *existing_node; - - if (!*it) { - /* Append a new node to the tail of the list */ - gpc_malloc(*it, sizeof(it_node), - const_cast("IT insertion")); - (*it)->ie[0] = edge0; - (*it)->ie[1] = edge1; - (*it)->point.x = x; - (*it)->point.y = y; - (*it)->next = NULL; - } else { - if ((*it)->point.y > y) { - /* Insert a new node mid-list */ - existing_node = *it; - gpc_malloc(*it, sizeof(it_node), - const_cast("IT insertion")); - (*it)->ie[0] = edge0; - (*it)->ie[1] = edge1; - (*it)->point.x = x; - (*it)->point.y = y; - (*it)->next = existing_node; - } else { - /* Head further down the list */ - add_intersection(&((*it)->next), edge0, edge1, x, y); - } - } -} - -static void add_st_edge(st_node **st, it_node **it, edge_node *edge, - double dy) { - st_node *existing_node; - double den = 0.0; - double r = 0.0; - double x = 0.0; - double y = 0.0; - - if (!*st) { 
- /* Append edge onto the tail end of the ST */ - gpc_malloc(*st, sizeof(st_node), - const_cast("ST insertion")); - (*st)->edge = edge; - (*st)->xb = edge->xb; - (*st)->xt = edge->xt; - (*st)->dx = edge->dx; - (*st)->prev = NULL; - } else { - den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb); - - /* If new edge and ST edge don't cross */ - if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) || - (fabs(den) <= DBL_EPSILON)) { - /* No intersection - insert edge here (before the ST edge) */ - existing_node = *st; - gpc_malloc(*st, sizeof(st_node), - const_cast("ST insertion")); - (*st)->edge = edge; - (*st)->xb = edge->xb; - (*st)->xt = edge->xt; - (*st)->dx = edge->dx; - (*st)->prev = existing_node; - } else { - /* Compute intersection between new edge and ST edge */ - r = (edge->xb - (*st)->xb) / den; - x = (*st)->xb + r * ((*st)->xt - (*st)->xb); - y = r * dy; - - /* Insert the edge pointers and the intersection point in the IT */ - add_intersection(it, (*st)->edge, edge, x, y); - - /* Head further into the ST */ - add_st_edge(&((*st)->prev), it, edge, dy); - } - } -} - -static void build_intersection_table(it_node **it, edge_node *aet, double dy) { - st_node *st; - st_node *stp; - edge_node *edge = NULL; - - /* Build intersection table for the current scanbeam */ - reset_it(it); - st = NULL; - - /* Process each AET edge */ - for (edge = aet; edge; edge = edge->next) { - if ((edge->bstate[ABOVE] == BUNDLE_HEAD) || edge->bundle[ABOVE][CLIP] || - edge->bundle[ABOVE][SUBJ]) { - add_st_edge(&st, it, edge, dy); - } - } - - /* Free the sorted edge table */ - while (st) { - stp = st->prev; - gpc_free(st); - st = stp; - } -} - -static int count_contours(polygon_node *polygon) { - int nc = 0; - int nv = 0; - vertex_node *v = NULL; - vertex_node *nextv = NULL; - - for (nc = 0; polygon; polygon = polygon->next) { - if (polygon->active) { - /* Count the vertices in the current contour */ - nv = 0; - for (v = polygon->proxy->v[LEFT]; v; v = v->next) { - nv++; - } - - /* 
Record valid vertex counts in the active field */ - if (nv > 2) { - polygon->active = nv; - nc++; - } else { - /* Invalid contour: just free the heap */ - for (v = polygon->proxy->v[LEFT]; v; v = nextv) { - nextv = v->next; - gpc_free(v); - } - polygon->active = 0; - } - } - } - return nc; -} - -static void add_left(polygon_node *p, double x, double y) { - vertex_node *nv = NULL; - - /* Create a new vertex node and set its fields */ - gpc_malloc(nv, sizeof(vertex_node), - const_cast("vertex node creation")); - nv->x = x; - nv->y = y; - - /* Add vertex nv to the left end of the polygon's vertex list */ - nv->next = p->proxy->v[LEFT]; - - /* Update proxy->[LEFT] to point to nv */ - p->proxy->v[LEFT] = nv; -} - -static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) { - polygon_node *target = NULL; - - /* Label contour as a hole */ - q->proxy->hole = 1; - - if (p->proxy != q->proxy) { - /* Assign p's vertex list to the left end of q's list */ - p->proxy->v[RIGHT]->next = q->proxy->v[LEFT]; - q->proxy->v[LEFT] = p->proxy->v[LEFT]; - - /* Redirect any p->proxy references to q->proxy */ - - for (target = p->proxy; list; list = list->next) { - if (list->proxy == target) { - list->active = 0; - list->proxy = q->proxy; - } - } - } -} - -static void add_right(polygon_node *p, double x, double y) { - vertex_node *nv = NULL; - - /* Create a new vertex node and set its fields */ - gpc_malloc(nv, sizeof(vertex_node), - const_cast("vertex node creation")); - nv->x = x; - nv->y = y; - nv->next = NULL; - - /* Add vertex nv to the right end of the polygon's vertex list */ - p->proxy->v[RIGHT]->next = nv; - - /* Update proxy->v[RIGHT] to point to nv */ - p->proxy->v[RIGHT] = nv; -} - -static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) { - polygon_node *target = NULL; - - /* Label contour as external */ - q->proxy->hole = 0; - - if (p->proxy != q->proxy) { - /* Assign p's vertex list to the right end of q's list */ - 
q->proxy->v[RIGHT]->next = p->proxy->v[LEFT]; - q->proxy->v[RIGHT] = p->proxy->v[RIGHT]; - - /* Redirect any p->proxy references to q->proxy */ - for (target = p->proxy; list; list = list->next) { - if (list->proxy == target) { - list->active = 0; - list->proxy = q->proxy; - } - } - } -} - -static void add_local_min(polygon_node **p, edge_node *edge, double x, - double y) { - polygon_node *existing_min = NULL; - vertex_node *nv = NULL; - - existing_min = *p; - - gpc_malloc(*p, sizeof(polygon_node), - const_cast("polygon node creation")); - - /* Create a new vertex node and set its fields */ - gpc_malloc(nv, sizeof(vertex_node), - const_cast("vertex node creation")); - nv->x = x; - nv->y = y; - nv->next = NULL; - - /* Initialise proxy to point to p itself */ - (*p)->proxy = (*p); - (*p)->active = 1; - (*p)->next = existing_min; - - /* Make v[LEFT] and v[RIGHT] point to new vertex nv */ - (*p)->v[LEFT] = nv; - (*p)->v[RIGHT] = nv; - - /* Assign polygon p to the edge */ - edge->outp[ABOVE] = *p; -} - -static int count_tristrips(polygon_node *tn) { - int total = 0; - - for (total = 0; tn; tn = tn->next) { - if (tn->active > 2) { - total++; - } - } - return total; -} - -void add_vertex(vertex_node **t, double x, double y) { - if (!(*t)) { - gpc_malloc(*t, sizeof(vertex_node), - const_cast("tristrip vertex creation")); - (*t)->x = x; - (*t)->y = y; - (*t)->next = NULL; - } else { - /* Head further down the list */ - add_vertex(&((*t)->next), x, y); - } -} - -void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) { - add_vertex(&(e->outp[p]->v[s]), x, y); - e->outp[p]->active++; -} - -static void new_tristrip(polygon_node **tn, edge_node *edge, double x, - double y) { - if (!(*tn)) { - gpc_malloc(*tn, sizeof(polygon_node), - const_cast("tristrip node creation")); - (*tn)->next = NULL; - (*tn)->v[LEFT] = NULL; - (*tn)->v[RIGHT] = NULL; - (*tn)->active = 1; - add_vertex(&((*tn)->v[LEFT]), x, y); - edge->outp[ABOVE] = *tn; - } else { - /* Head further down 
the list */ - new_tristrip(&((*tn)->next), edge, x, y); - } -} - -static bbox *create_contour_bboxes(gpc_polygon *p) { - bbox *box; - int c = 0; - int v = 0; - - gpc_malloc(box, p->num_contours * sizeof(bbox), - const_cast("Bounding box creation")); - - /* Construct contour bounding boxes */ - for (c = 0; c < p->num_contours; c++) { - /* Initialise bounding box extent */ - box[c].xmin = DBL_MAX; - box[c].ymin = DBL_MAX; - box[c].xmax = -DBL_MAX; - box[c].ymax = -DBL_MAX; - - for (v = 0; v < p->contour[c].num_vertices; v++) { - /* Adjust bounding box */ - if (p->contour[c].vertex[v].x < box[c].xmin) { - box[c].xmin = p->contour[c].vertex[v].x; - } - if (p->contour[c].vertex[v].y < box[c].ymin) { - box[c].ymin = p->contour[c].vertex[v].y; - } - if (p->contour[c].vertex[v].x > box[c].xmax) { - box[c].xmax = p->contour[c].vertex[v].x; - } - if (p->contour[c].vertex[v].y > box[c].ymax) { - box[c].ymax = p->contour[c].vertex[v].y; - } - } - } - return box; -} - -static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) { - bbox *s_bbox; - bbox *c_bbox; - int s = 0; - int c = 0; - int *o_table = NULL; - int overlap = 0; - - s_bbox = create_contour_bboxes(subj); - c_bbox = create_contour_bboxes(clip); - - gpc_malloc(o_table, - subj->num_contours * clip->num_contours * sizeof(int), - const_cast("overlap table creation")); - - /* Check all subject contour bounding boxes against clip boxes */ - for (s = 0; s < subj->num_contours; s++) { - for (c = 0; c < clip->num_contours; c++) { - o_table[c * subj->num_contours + s] = - (!((s_bbox[s].xmax < c_bbox[c].xmin) || - (s_bbox[s].xmin > c_bbox[c].xmax))) && - (!((s_bbox[s].ymax < c_bbox[c].ymin) || - (s_bbox[s].ymin > c_bbox[c].ymax))); - } - } - - /* For each clip contour, search for any subject contour overlaps */ - for (c = 0; c < clip->num_contours; c++) { - overlap = 0; - for (s = 0; (!overlap) && (s < subj->num_contours); s++) { - overlap = o_table[c * subj->num_contours + s]; - } - - if (!overlap) { - /* Flag 
non contributing status by negating vertex count */ - clip->contour[c].num_vertices = -clip->contour[c].num_vertices; - } - } - - if (op == GPC_INT) { - /* For each subject contour, search for any clip contour overlaps */ - for (s = 0; s < subj->num_contours; s++) { - overlap = 0; - for (c = 0; (!overlap) && (c < clip->num_contours); c++) { - overlap = o_table[c * subj->num_contours + s]; - } - - if (!overlap) { - /* Flag non contributing status by negating vertex count */ - subj->contour[s].num_vertices = -subj->contour[s].num_vertices; - } - } - } - - gpc_free(s_bbox); - gpc_free(c_bbox); - gpc_free(o_table); -} - -/* -=========================================================================== - Public Functions -=========================================================================== -*/ - -void gpc_free_polygon(gpc_polygon *p) { - int c = 0; - - for (c = 0; c < p->num_contours; c++) { - gpc_free(p->contour[c].vertex); - } - gpc_free(p->hole); - gpc_free(p->contour); - p->num_contours = 0; -} - -void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { - int *extended_hole = NULL; - int c = 0; - int v = 0; - gpc_vertex_list *extended_contour = NULL; - - /* Create an extended hole array */ - gpc_malloc(extended_hole, (p->num_contours + 1) * sizeof(int), - const_cast("contour hole addition")); - - /* Create an extended contour array */ - gpc_malloc(extended_contour, - (p->num_contours + 1) * sizeof(gpc_vertex_list), - const_cast("contour addition")); - - /* Copy the old contour and hole data into the extended arrays */ - for (c = 0; c < p->num_contours; c++) { - extended_hole[c] = p->hole[c]; - extended_contour[c] = p->contour[c]; - } - - /* Copy the new contour and hole onto the end of the extended arrays */ - c = p->num_contours; - extended_hole[c] = hole; - extended_contour[c].num_vertices = new_contour->num_vertices; - gpc_malloc(extended_contour[c].vertex, - new_contour->num_vertices * sizeof(gpc_vertex), - const_cast("contour 
addition")); - for (v = 0; v < new_contour->num_vertices; v++) { - extended_contour[c].vertex[v] = new_contour->vertex[v]; - } - - /* Dispose of the old contour */ - gpc_free(p->contour); - gpc_free(p->hole); - - /* Update the polygon information */ - p->num_contours++; - p->hole = extended_hole; - p->contour = extended_contour; -} - -// gpc_polygon_clip -void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, - gpc_polygon *result) { - sb_tree *sbtree = NULL; - it_node *it = NULL; - it_node *intersect = NULL; - edge_node *edge = NULL; - edge_node *prev_edge = NULL; - edge_node *next_edge = NULL; - edge_node *succ_edge = NULL; - edge_node *e0 = NULL; - edge_node *e1 = NULL; - edge_node *aet = NULL; - edge_node *c_heap = NULL; - edge_node *s_heap = NULL; - lmt_node *lmt = NULL; - lmt_node *local_min = NULL; - polygon_node *out_poly = NULL; - polygon_node *p = NULL; - polygon_node *q = NULL; - polygon_node *poly = NULL; - polygon_node *npoly = NULL; - polygon_node *cf = NULL; - vertex_node *vtx = NULL; - vertex_node *nv = NULL; - h_state horiz[2]; - int in[2]; - int exists[2]; - int parity[2] = {LEFT, LEFT}; - int c = 0; - int v = 0; - int contributing = 0; - int search = 0; - int scanbeam = 0; - int sbt_entries = 0; - int vclass = 0; - int bl = 0; - int br = 0; - int tl = 0; - int tr = 0; - double *sbt = NULL; - double xb = 0.0; - double px = 0.0; - double yb = 0.0; - double yt = 0.0; - double dy = 0.0; - double ix = 0.0; - double iy = 0.0; - - /* Test for trivial NULL result cases */ - if (((subj->num_contours == 0) && (clip->num_contours == 0)) || - ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) || - ((clip->num_contours == 0) && (op == GPC_INT))) { - result->num_contours = 0; - result->hole = NULL; - result->contour = NULL; - return; - } - /* Identify potentialy contributing contours */ - if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && - (clip->num_contours > 0)) { - minimax_test(subj, clip, op); - } - 
/* Build LMT */ - if (subj->num_contours > 0) { - s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); - } - if (clip->num_contours > 0) { - c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); - } - /* Return a NULL result if no contours contribute */ - if (lmt == NULL) { - result->num_contours = 0; - result->hole = NULL; - result->contour = NULL; - reset_lmt(&lmt); - gpc_free(s_heap); - gpc_free(c_heap); - return; - } - - /* Build scanbeam table from scanbeam tree */ - gpc_malloc(sbt, sbt_entries * sizeof(double), - const_cast("sbt creation")); - build_sbt(&scanbeam, sbt, sbtree); - scanbeam = 0; - free_sbtree(&sbtree); - /* Allow pointer re-use without causing memory leak */ - if (subj == result) { - gpc_free_polygon(subj); - } - if (clip == result) { - gpc_free_polygon(clip); - } - /* Invert clip polygon for difference operation */ - if (op == GPC_DIFF) { - parity[CLIP] = RIGHT; - } - local_min = lmt; - - // Process each scanbeam - while (scanbeam < sbt_entries) { - /* Set yb and yt to the bottom and top of the scanbeam */ - yb = sbt[scanbeam++]; - if (scanbeam < sbt_entries) { - yt = sbt[scanbeam]; - dy = yt - yb; - } - /* === SCANBEAM BOUNDARY PROCESSING ================================ */ - /* If LMT node corresponding to yb exists */ - if (local_min) { - if (local_min->y == yb) { - /* Add edges starting at this local minimum to the AET */ - for (edge = local_min->first_bound; edge; edge = edge->next_bound) { - add_edge_to_aet(&aet, edge, NULL); - } - local_min = local_min->next; - } - } - /* Set dummy previous x value */ - px = -DBL_MAX; - /* Create bundles within AET */ - e0 = aet; - e1 = aet; - /* Set up bundle fields of first edge */ - aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); - aet->bundle[ABOVE][!aet->type] = 0; - aet->bstate[ABOVE] = UNBUNDLED; - - for (next_edge = aet->next; next_edge; next_edge = next_edge->next) { - /* Set up bundle fields of next edge */ - next_edge->bundle[ABOVE][next_edge->type] = 
(next_edge->top.y != yb); - next_edge->bundle[ABOVE][!next_edge->type] = 0; - next_edge->bstate[ABOVE] = UNBUNDLED; - /* Bundle edges above the scanbeam boundary if they coincide */ - if (next_edge->bundle[ABOVE][next_edge->type]) { - if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && - (e0->top.y != yb)) { - next_edge->bundle[ABOVE][next_edge->type] ^= - e0->bundle[ABOVE][next_edge->type]; - next_edge->bundle[ABOVE][!next_edge->type] = - e0->bundle[ABOVE][!next_edge->type]; - next_edge->bstate[ABOVE] = BUNDLE_HEAD; - e0->bundle[ABOVE][CLIP] = 0; - e0->bundle[ABOVE][SUBJ] = 0; - e0->bstate[ABOVE] = BUNDLE_TAIL; - } - e0 = next_edge; - } - } - horiz[CLIP] = NH; - horiz[SUBJ] = NH; - - // Process each edge at this scanbeam boundary - for (edge = aet; edge; edge = edge->next) { - exists[CLIP] = - edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); - exists[SUBJ] = - edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); - if (exists[CLIP] || exists[SUBJ]) { - /* Set bundle side */ - edge->bside[CLIP] = parity[CLIP]; - edge->bside[SUBJ] = parity[SUBJ]; - /* Determine contributing status and quadrant occupancies */ - switch (op) { - case GPC_DIFF: - case GPC_INT: - contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || - (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || - (exists[CLIP] && exists[SUBJ] && - (parity[CLIP] == parity[SUBJ])); - br = (parity[CLIP]) && (parity[SUBJ]); - bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && - (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); - tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && - (parity[SUBJ] ^ (horiz[SUBJ] != NH)); - tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ - edge->bundle[BELOW][CLIP]) && - (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ - edge->bundle[BELOW][SUBJ]); - break; - case GPC_XOR: - contributing = exists[CLIP] || exists[SUBJ]; - br = (parity[CLIP]) ^ (parity[SUBJ]); - bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ - (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); - tr = 
(parity[CLIP] ^ (horiz[CLIP] != NH)) ^ - (parity[SUBJ] ^ (horiz[SUBJ] != NH)); - tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ - edge->bundle[BELOW][CLIP]) ^ - (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ - edge->bundle[BELOW][SUBJ]); - break; - case GPC_UNION: - contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || - (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || - (exists[CLIP] && exists[SUBJ] && - (parity[CLIP] == parity[SUBJ])); - br = (parity[CLIP]) || (parity[SUBJ]); - bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || - (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); - tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || - (parity[SUBJ] ^ (horiz[SUBJ] != NH)); - tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ - edge->bundle[BELOW][CLIP]) || - (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ - edge->bundle[BELOW][SUBJ]); - break; - } - // Update parity - parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; - parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; - /* Update horizontal state */ - if (exists[CLIP]) { - horiz[CLIP] = next_h_state[horiz[CLIP]] - [((exists[CLIP] - 1) << 1) + parity[CLIP]]; - } - if (exists[SUBJ]) { - horiz[SUBJ] = next_h_state[horiz[SUBJ]] - [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; - } - vclass = tr + (tl << 1) + (br << 2) + (bl << 3); - if (contributing) { - xb = edge->xb; - switch (vclass) { - case EMN: - case IMN: - add_local_min(&out_poly, edge, xb, yb); - px = xb; - cf = edge->outp[ABOVE]; - break; - case ERI: - if (xb != px) { - add_right(cf, xb, yb); - px = xb; - } - edge->outp[ABOVE] = cf; - cf = NULL; - break; - case ELI: - add_left(edge->outp[BELOW], xb, yb); - px = xb; - cf = edge->outp[BELOW]; - break; - case EMX: - if (xb != px) { - add_left(cf, xb, yb); - px = xb; - } - merge_right(cf, edge->outp[BELOW], out_poly); - cf = NULL; - break; - case ILI: - if (xb != px) { - add_left(cf, xb, yb); - px = xb; - } - edge->outp[ABOVE] = cf; - cf = NULL; - break; - case IRI: - add_right(edge->outp[BELOW], xb, yb); - px = xb; - cf = edge->outp[BELOW]; - 
edge->outp[BELOW] = NULL; - break; - case IMX: - if (xb != px) { - add_right(cf, xb, yb); - px = xb; - } - merge_left(cf, edge->outp[BELOW], out_poly); - cf = NULL; - edge->outp[BELOW] = NULL; - break; - case IMM: - if (xb != px) { - add_right(cf, xb, yb); - px = xb; - } - merge_left(cf, edge->outp[BELOW], out_poly); - edge->outp[BELOW] = NULL; - add_local_min(&out_poly, edge, xb, yb); - cf = edge->outp[ABOVE]; - break; - case EMM: - if (xb != px) { - add_left(cf, xb, yb); - px = xb; - } - merge_right(cf, edge->outp[BELOW], out_poly); - edge->outp[BELOW] = NULL; - add_local_min(&out_poly, edge, xb, yb); - cf = edge->outp[ABOVE]; - break; - case LED: - if (edge->bot.y == yb) { - add_left(edge->outp[BELOW], xb, yb); - } - edge->outp[ABOVE] = edge->outp[BELOW]; - px = xb; - break; - case RED: - if (edge->bot.y == yb) { - add_right(edge->outp[BELOW], xb, yb); - } - edge->outp[ABOVE] = edge->outp[BELOW]; - px = xb; - break; - default: - break; - } /* End of switch */ - } /* End of contributing conditional */ - } /* End of edge exists conditional */ - } // End of AET loop - - /* Delete terminating edges from the AET, otherwise compute xt */ - for (edge = aet; edge; edge = edge->next) { - if (edge->top.y == yb) { - prev_edge = edge->prev; - next_edge = edge->next; - if (prev_edge) { - prev_edge->next = next_edge; - } else { - aet = next_edge; - } - if (next_edge) { - next_edge->prev = prev_edge; - } - /* Copy bundle head state to the adjacent tail edge if required */ - if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { - if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { - prev_edge->outp[BELOW] = edge->outp[BELOW]; - prev_edge->bstate[BELOW] = UNBUNDLED; - if (prev_edge->prev) { - if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { - prev_edge->bstate[BELOW] = BUNDLE_HEAD; - } - } - } - } - } else { - if (edge->top.y == yt) { - edge->xt = edge->top.x; - } else { - edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); - } - } - } - - if (scanbeam < sbt_entries) { - 
/* === SCANBEAM INTERIOR PROCESSING ============================== */ - build_intersection_table(&it, aet, dy); - /* Process each node in the intersection table */ - for (intersect = it; intersect; intersect = intersect->next) { - e0 = intersect->ie[0]; - e1 = intersect->ie[1]; - /* Only generate output for contributing intersections */ - if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && - (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { - p = e0->outp[ABOVE]; - q = e1->outp[ABOVE]; - ix = intersect->point.x; - iy = intersect->point.y + yb; - - in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || - (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || - (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] && - e0->bside[CLIP] && e1->bside[CLIP]); - in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || - (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || - (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && - e0->bside[SUBJ] && e1->bside[SUBJ]); - - // Determine quadrant occupancies - switch (op) { - case GPC_DIFF: - case GPC_INT: - tr = (in[CLIP]) && (in[SUBJ]); - tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); - br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && - (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); - bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ - e0->bundle[ABOVE][CLIP]) && - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ - e0->bundle[ABOVE][SUBJ]); - break; - case GPC_XOR: - tr = (in[CLIP]) ^ (in[SUBJ]); - tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); - br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ - (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); - bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ - e0->bundle[ABOVE][CLIP]) ^ - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ - e0->bundle[ABOVE][SUBJ]); - break; - case GPC_UNION: - tr = (in[CLIP]) || (in[SUBJ]); - tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); - br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || - (in[SUBJ] 
^ e0->bundle[ABOVE][SUBJ]); - bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ - e0->bundle[ABOVE][CLIP]) || - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ - e0->bundle[ABOVE][SUBJ]); - break; - } - vclass = tr + (tl << 1) + (br << 2) + (bl << 3); - switch (vclass) { - case EMN: - add_local_min(&out_poly, e0, ix, iy); - e1->outp[ABOVE] = e0->outp[ABOVE]; - break; - case ERI: - if (p) { - add_right(p, ix, iy); - e1->outp[ABOVE] = p; - e0->outp[ABOVE] = NULL; - } - break; - case ELI: - if (q) { - add_left(q, ix, iy); - e0->outp[ABOVE] = q; - e1->outp[ABOVE] = NULL; - } - break; - case EMX: - if (p && q) { - add_left(p, ix, iy); - merge_right(p, q, out_poly); - e0->outp[ABOVE] = NULL; - e1->outp[ABOVE] = NULL; - } - break; - case IMN: - add_local_min(&out_poly, e0, ix, iy); - e1->outp[ABOVE] = e0->outp[ABOVE]; - break; - case ILI: - if (p) { - add_left(p, ix, iy); - e1->outp[ABOVE] = p; - e0->outp[ABOVE] = NULL; - } - break; - case IRI: - if (q) { - add_right(q, ix, iy); - e0->outp[ABOVE] = q; - e1->outp[ABOVE] = NULL; - } - break; - case IMX: - if (p && q) { - add_right(p, ix, iy); - merge_left(p, q, out_poly); - e0->outp[ABOVE] = NULL; - e1->outp[ABOVE] = NULL; - } - break; - case IMM: - if (p && q) { - add_right(p, ix, iy); - merge_left(p, q, out_poly); - add_local_min(&out_poly, e0, ix, iy); - e1->outp[ABOVE] = e0->outp[ABOVE]; - } - break; - case EMM: - if (p && q) { - add_left(p, ix, iy); - merge_right(p, q, out_poly); - add_local_min(&out_poly, e0, ix, iy); - e1->outp[ABOVE] = e0->outp[ABOVE]; - } - break; - default: - break; - } // End of switch - } /* End of contributing intersection conditional */ - - /* Swap bundle sides in response to edge crossing */ - if (e0->bundle[ABOVE][CLIP]) { - e1->bside[CLIP] = !e1->bside[CLIP]; - } - if (e1->bundle[ABOVE][CLIP]) { - e0->bside[CLIP] = !e0->bside[CLIP]; - } - if (e0->bundle[ABOVE][SUBJ]) { - e1->bside[SUBJ] = !e1->bside[SUBJ]; - } - if (e1->bundle[ABOVE][SUBJ]) { - e0->bside[SUBJ] = !e0->bside[SUBJ]; - } - - /* Swap e0 and e1 
bundles in the AET */ - prev_edge = e0->prev; - next_edge = e1->next; - if (next_edge) { - next_edge->prev = e0; - } - if (e0->bstate[ABOVE] == BUNDLE_HEAD) { - search = 1; - while (search) { - prev_edge = prev_edge->prev; - if (prev_edge) { - if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) { - search = 0; - } - } else { - search = 0; - } - } - } - if (!prev_edge) { - aet->prev = e1; - e1->next = aet; - aet = e0->next; - } else { - prev_edge->next->prev = e1; - e1->next = prev_edge->next; - prev_edge->next = e0->next; - } - e0->next->prev = prev_edge; - e1->next->prev = e1; - e0->next = next_edge; - } /* End of IT loop*/ - - // Prepare for next scanbeam - for (edge = aet; edge; edge = next_edge) { - next_edge = edge->next; - succ_edge = edge->succ; - if ((edge->top.y == yt) && succ_edge) { - /* Replace AET edge by its successor */ - succ_edge->outp[BELOW] = edge->outp[ABOVE]; - succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; - succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; - succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; - prev_edge = edge->prev; - if (prev_edge) { - prev_edge->next = succ_edge; - } else { - aet = succ_edge; - } - if (next_edge) { - next_edge->prev = succ_edge; - } - succ_edge->prev = prev_edge; - succ_edge->next = next_edge; - } else { - /* Update this edge */ - edge->outp[BELOW] = edge->outp[ABOVE]; - edge->bstate[BELOW] = edge->bstate[ABOVE]; - edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; - edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; - edge->xb = edge->xt; - } - edge->outp[ABOVE] = NULL; - } - } - } /* === END OF SCANBEAM PROCESSING ================================== */ - // Generate result polygon from out_poly - result->contour = NULL; - result->hole = NULL; - result->num_contours = count_contours(out_poly); - if (result->num_contours > 0) { - gpc_malloc(result->hole, result->num_contours * sizeof(int), - const_cast("hole flag table creation")); - gpc_malloc(result->contour, - result->num_contours * 
sizeof(gpc_vertex_list), - const_cast("contour creation")); - - c = 0; - for (poly = out_poly; poly; poly = npoly) { - npoly = poly->next; - if (poly->active) { - result->hole[c] = poly->proxy->hole; - result->contour[c].num_vertices = poly->active; - gpc_malloc( - result->contour[c].vertex, - result->contour[c].num_vertices * sizeof(gpc_vertex), - const_cast("vertex creation")); - - v = result->contour[c].num_vertices - 1; - for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) { - nv = vtx->next; - result->contour[c].vertex[v].x = vtx->x; - result->contour[c].vertex[v].y = vtx->y; - gpc_free(vtx); - v--; - } - c++; - } - gpc_free(poly); - } - } else { - for (poly = out_poly; poly; poly = npoly) { - npoly = poly->next; - gpc_free(poly); - } - } - - // Tidy up - reset_it(&it); - reset_lmt(&lmt); - gpc_free(c_heap); - gpc_free(s_heap); - gpc_free(sbt); -} // NOLINT - -void gpc_free_tristrip(gpc_tristrip *t) { - int s = 0; - for (s = 0; s < t->num_strips; s++) { - gpc_free(t->strip[s].vertex); - } - gpc_free(t->strip); - t->num_strips = 0; -} - -void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) { - gpc_polygon c; - c.num_contours = 0; - c.hole = NULL; - c.contour = NULL; - gpc_tristrip_clip(GPC_DIFF, s, &c, t); -} - -// gpc_tristrip_clip -void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, - gpc_tristrip *result) { - sb_tree *sbtree = NULL; - it_node *it = NULL; - it_node *intersect = NULL; - edge_node *edge = NULL; - edge_node *prev_edge = NULL; - edge_node *next_edge = NULL; - edge_node *succ_edge = NULL; - edge_node *e0 = NULL; - edge_node *e1 = NULL; - edge_node *aet = NULL; - edge_node *c_heap = NULL; - edge_node *s_heap = NULL; - edge_node *cf = NULL; - lmt_node *lmt = NULL; - lmt_node *local_min = NULL; - polygon_node *tlist = NULL; - polygon_node *tn = NULL; - polygon_node *tnn = NULL; - polygon_node *p = NULL; - polygon_node *q = NULL; - vertex_node *lt = NULL; - vertex_node *ltn = NULL; - vertex_node *rt = NULL; - vertex_node *rtn 
= NULL; - h_state horiz[2]; - vertex_type cft = NUL; - int in[2]; - int exists[2]; - int parity[2] = {LEFT, LEFT}; - int s = 0; - int v = 0; - int contributing = 0; - int search = 0; - int scanbeam = 0; - int sbt_entries = 0; - int vclass = 0; - int bl = 0; - int br = 0; - int tl = 0; - int tr = 0; - double *sbt = NULL; - double xb = 0.0; - double px = 0.0; - double nx = 0.0; - double yb = 0.0; - double yt = 0.0; - double dy = 0.0; - double ix = 0.0; - double iy = 0.0; - - /* Test for trivial NULL result cases */ - if (((subj->num_contours == 0) && (clip->num_contours == 0)) || - ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) || - ((clip->num_contours == 0) && (op == GPC_INT))) { - result->num_strips = 0; - result->strip = NULL; - return; - } - - /* Identify potentialy contributing contours */ - if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && - (clip->num_contours > 0)) { - minimax_test(subj, clip, op); - } - /* Build LMT */ - if (subj->num_contours > 0) { - s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); - } - if (clip->num_contours > 0) { - c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); - } - /* Return a NULL result if no contours contribute */ - if (lmt == NULL) { - result->num_strips = 0; - result->strip = NULL; - reset_lmt(&lmt); - gpc_free(s_heap); - gpc_free(c_heap); - return; - } - - /* Build scanbeam table from scanbeam tree */ - gpc_malloc(sbt, sbt_entries * sizeof(double), - const_cast("sbt creation")); - build_sbt(&scanbeam, sbt, sbtree); - scanbeam = 0; - free_sbtree(&sbtree); - - /* Invert clip polygon for difference operation */ - if (op == GPC_DIFF) { - parity[CLIP] = RIGHT; - } - local_min = lmt; - - // Process each scanbeam - while (scanbeam < sbt_entries) { - /* Set yb and yt to the bottom and top of the scanbeam */ - yb = sbt[scanbeam++]; - if (scanbeam < sbt_entries) { - yt = sbt[scanbeam]; - dy = yt - yb; - } - - /* === SCANBEAM BOUNDARY PROCESSING 
================================ */ - /* If LMT node corresponding to yb exists */ - if (local_min) { - if (local_min->y == yb) { - /* Add edges starting at this local minimum to the AET */ - for (edge = local_min->first_bound; edge; edge = edge->next_bound) { - add_edge_to_aet(&aet, edge, NULL); - } - local_min = local_min->next; - } - } - /* Set dummy previous x value */ - /* Create bundles within AET */ - px = -DBL_MAX; - e0 = aet; - e1 = aet; - - /* Set up bundle fields of first edge */ - aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); - aet->bundle[ABOVE][!aet->type] = 0; - aet->bstate[ABOVE] = UNBUNDLED; - - for (next_edge = aet->next; next_edge; next_edge = next_edge->next) { - /* Set up bundle fields of next edge */ - next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb); - next_edge->bundle[ABOVE][!next_edge->type] = 0; - next_edge->bstate[ABOVE] = UNBUNDLED; - - /* Bundle edges above the scanbeam boundary if they coincide */ - if (next_edge->bundle[ABOVE][next_edge->type]) { - if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && - (e0->top.y != yb)) { - next_edge->bundle[ABOVE][next_edge->type] ^= - e0->bundle[ABOVE][next_edge->type]; - next_edge->bundle[ABOVE][!next_edge->type] = - e0->bundle[ABOVE][!next_edge->type]; - next_edge->bstate[ABOVE] = BUNDLE_HEAD; - e0->bundle[ABOVE][CLIP] = 0; - e0->bundle[ABOVE][SUBJ] = 0; - e0->bstate[ABOVE] = BUNDLE_TAIL; - } - e0 = next_edge; - } - } - horiz[CLIP] = NH; - horiz[SUBJ] = NH; - - /* Process each edge at this scanbeam boundary */ - for (edge = aet; edge; edge = edge->next) { - exists[CLIP] = - edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); - exists[SUBJ] = - edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); - - if (exists[CLIP] || exists[SUBJ]) { - /* Set bundle side */ - edge->bside[CLIP] = parity[CLIP]; - edge->bside[SUBJ] = parity[SUBJ]; - - /* Determine contributing status and quadrant occupancies */ - switch (op) { - case GPC_DIFF: - case 
GPC_INT: - contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || - (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || - (exists[CLIP] && exists[SUBJ] && - (parity[CLIP] == parity[SUBJ])); - br = (parity[CLIP]) && (parity[SUBJ]); - bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && - (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); - tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && - (parity[SUBJ] ^ (horiz[SUBJ] != NH)); - tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ - edge->bundle[BELOW][CLIP]) && - (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ - edge->bundle[BELOW][SUBJ]); - break; - case GPC_XOR: - contributing = exists[CLIP] || exists[SUBJ]; - br = (parity[CLIP]) ^ (parity[SUBJ]); - bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ - (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); - tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^ - (parity[SUBJ] ^ (horiz[SUBJ] != NH)); - tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ - edge->bundle[BELOW][CLIP]) ^ - (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ - edge->bundle[BELOW][SUBJ]); - break; - case GPC_UNION: - contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || - (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || - (exists[CLIP] && exists[SUBJ] && - (parity[CLIP] == parity[SUBJ])); - br = (parity[CLIP]) || (parity[SUBJ]); - bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || - (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); - tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || - (parity[SUBJ] ^ (horiz[SUBJ] != NH)); - tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ - edge->bundle[BELOW][CLIP]) || - (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ - edge->bundle[BELOW][SUBJ]); - break; - } - - // Update parity - parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; - parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; - - /* Update horizontal state */ - if (exists[CLIP]) { - horiz[CLIP] = next_h_state[horiz[CLIP]] - [((exists[CLIP] - 1) << 1) + parity[CLIP]]; - } - if (exists[SUBJ]) { - horiz[SUBJ] = next_h_state[horiz[SUBJ]] - [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; - } - 
vclass = tr + (tl << 1) + (br << 2) + (bl << 3); - - if (contributing) { - xb = edge->xb; - switch (vclass) { - case EMN: - new_tristrip(&tlist, edge, xb, yb); - cf = edge; - break; - case ERI: - edge->outp[ABOVE] = cf->outp[ABOVE]; - if (xb != cf->xb) { - gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); - } - cf = NULL; - break; - case ELI: - gpc_vertex_create(edge, BELOW, LEFT, xb, yb); - edge->outp[ABOVE] = NULL; - cf = edge; - break; - case EMX: - if (xb != cf->xb) { - gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); - } - edge->outp[ABOVE] = NULL; - cf = NULL; - break; - case IMN: - if (cft == LED) { - if (cf->bot.y != yb) { - gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); - } - new_tristrip(&tlist, cf, cf->xb, yb); - } - edge->outp[ABOVE] = cf->outp[ABOVE]; - gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); - break; - case ILI: - new_tristrip(&tlist, edge, xb, yb); - cf = edge; - cft = ILI; - break; - case IRI: - if (cft == LED) { - if (cf->bot.y != yb) { - gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); - } - new_tristrip(&tlist, cf, cf->xb, yb); - } - gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); - edge->outp[ABOVE] = NULL; - break; - case IMX: - gpc_vertex_create(edge, BELOW, LEFT, xb, yb); - edge->outp[ABOVE] = NULL; - cft = IMX; - break; - case IMM: - gpc_vertex_create(edge, BELOW, LEFT, xb, yb); - edge->outp[ABOVE] = cf->outp[ABOVE]; - if (xb != cf->xb) { - gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb); - } - cf = edge; - break; - case EMM: - gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); - edge->outp[ABOVE] = NULL; - new_tristrip(&tlist, edge, xb, yb); - cf = edge; - break; - case LED: - if (edge->bot.y == yb) { - gpc_vertex_create(edge, BELOW, LEFT, xb, yb); - } - edge->outp[ABOVE] = edge->outp[BELOW]; - cf = edge; - cft = LED; - break; - case RED: - edge->outp[ABOVE] = cf->outp[ABOVE]; - if (cft == LED) { - if (cf->bot.y == yb) { - gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); - } else { - if (edge->bot.y == yb) { - gpc_vertex_create(cf, BELOW, LEFT, 
cf->xb, yb); - gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); - } - } - } else { - gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); - gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); - } - cf = NULL; - break; - default: - break; - } /* End of switch */ - } /* End of contributing conditional */ - } /* End of edge exists conditional */ - } // End of AET loop - - /* Delete terminating edges from the AET, otherwise compute xt */ - for (edge = aet; edge; edge = edge->next) { - if (edge->top.y == yb) { - prev_edge = edge->prev; - next_edge = edge->next; - if (prev_edge) { - prev_edge->next = next_edge; - } else { - aet = next_edge; - } - if (next_edge) { - next_edge->prev = prev_edge; - } - - /* Copy bundle head state to the adjacent tail edge if required */ - if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { - if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { - prev_edge->outp[BELOW] = edge->outp[BELOW]; - prev_edge->bstate[BELOW] = UNBUNDLED; - if (prev_edge->prev) { - if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { - prev_edge->bstate[BELOW] = BUNDLE_HEAD; - } - } - } - } - } else { - if (edge->top.y == yt) { - edge->xt = edge->top.x; - } else { - edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); - } - } - } - - if (scanbeam < sbt_entries) { - /* === SCANBEAM INTERIOR PROCESSING ============================== */ - build_intersection_table(&it, aet, dy); - /* Process each node in the intersection table */ - for (intersect = it; intersect; intersect = intersect->next) { - e0 = intersect->ie[0]; - e1 = intersect->ie[1]; - - /* Only generate output for contributing intersections */ - if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && - (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { - p = e0->outp[ABOVE]; - q = e1->outp[ABOVE]; - ix = intersect->point.x; - iy = intersect->point.y + yb; - - in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || - (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || - (!e0->bundle[ABOVE][CLIP] && 
!e1->bundle[ABOVE][CLIP] && - e0->bside[CLIP] && e1->bside[CLIP]); - in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || - (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || - (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && - e0->bside[SUBJ] && e1->bside[SUBJ]); - - switch (op) { // Determine quadrant occupancies - case GPC_DIFF: - case GPC_INT: - tr = (in[CLIP]) && (in[SUBJ]); - tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); - br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && - (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); - bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ - e0->bundle[ABOVE][CLIP]) && - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ - e0->bundle[ABOVE][SUBJ]); - break; - case GPC_XOR: - tr = (in[CLIP]) ^ (in[SUBJ]); - tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); - br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ - (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); - bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ - e0->bundle[ABOVE][CLIP]) ^ - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ - e0->bundle[ABOVE][SUBJ]); - break; - case GPC_UNION: - tr = (in[CLIP]) || (in[SUBJ]); - tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); - br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || - (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); - bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ - e0->bundle[ABOVE][CLIP]) || - (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ - e0->bundle[ABOVE][SUBJ]); - break; - } - - vclass = tr + (tl << 1) + (br << 2) + (bl << 3); - switch (vclass) { - case EMN: - new_tristrip(&tlist, e1, ix, iy); - e0->outp[ABOVE] = e1->outp[ABOVE]; - break; - case ERI: - if (p) { - gpc_p_edge(prev_edge, e0, ABOVE); - gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); - gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); - e1->outp[ABOVE] = e0->outp[ABOVE]; - e0->outp[ABOVE] = NULL; - } - break; - case ELI: - if (q) { - gpc_n_edge(next_edge, e1, ABOVE); - gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); - 
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - e0->outp[ABOVE] = e1->outp[ABOVE]; - e1->outp[ABOVE] = NULL; - } - break; - case EMX: - if (p && q) { - gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); - e0->outp[ABOVE] = NULL; - e1->outp[ABOVE] = NULL; - } - break; - case IMN: - gpc_p_edge(prev_edge, e0, ABOVE); - gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); - gpc_n_edge(next_edge, e1, ABOVE); - gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - new_tristrip(&tlist, prev_edge, px, iy); - e1->outp[ABOVE] = prev_edge->outp[ABOVE]; - gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); - new_tristrip(&tlist, e0, ix, iy); - next_edge->outp[ABOVE] = e0->outp[ABOVE]; - gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - break; - case ILI: - if (p) { - gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); - gpc_n_edge(next_edge, e1, ABOVE); - gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - e1->outp[ABOVE] = e0->outp[ABOVE]; - e0->outp[ABOVE] = NULL; - } - break; - case IRI: - if (q) { - gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); - gpc_p_edge(prev_edge, e0, ABOVE); - gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); - e0->outp[ABOVE] = e1->outp[ABOVE]; - e1->outp[ABOVE] = NULL; - } - break; - case IMX: - if (p && q) { - gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); - gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); - e0->outp[ABOVE] = NULL; - e1->outp[ABOVE] = NULL; - gpc_p_edge(prev_edge, e0, ABOVE); - gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); - new_tristrip(&tlist, prev_edge, px, iy); - gpc_n_edge(next_edge, e1, ABOVE); - gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - next_edge->outp[ABOVE] = prev_edge->outp[ABOVE]; - gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - } - break; - case IMM: - if (p && q) { - gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); - gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); - gpc_p_edge(prev_edge, e0, ABOVE); - gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); - new_tristrip(&tlist, prev_edge, px, iy); - gpc_n_edge(next_edge, e1, 
ABOVE); - gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - e1->outp[ABOVE] = prev_edge->outp[ABOVE]; - gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); - new_tristrip(&tlist, e0, ix, iy); - next_edge->outp[ABOVE] = e0->outp[ABOVE]; - gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); - } - break; - case EMM: - if (p && q) { - gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); - new_tristrip(&tlist, e1, ix, iy); - e0->outp[ABOVE] = e1->outp[ABOVE]; - } - break; - default: - break; - } /* End of switch */ - } /* End of contributing intersection conditional */ - - // Swap bundle sides in response to edge crossing - if (e0->bundle[ABOVE][CLIP]) { - e1->bside[CLIP] = !e1->bside[CLIP]; - } - if (e1->bundle[ABOVE][CLIP]) { - e0->bside[CLIP] = !e0->bside[CLIP]; - } - if (e0->bundle[ABOVE][SUBJ]) { - e1->bside[SUBJ] = !e1->bside[SUBJ]; - } - if (e1->bundle[ABOVE][SUBJ]) { - e0->bside[SUBJ] = !e0->bside[SUBJ]; - } - - /* Swap e0 and e1 bundles in the AET */ - prev_edge = e0->prev; - next_edge = e1->next; - if (e1->next) { - e1->next->prev = e0; - } - - if (e0->bstate[ABOVE] == BUNDLE_HEAD) { - search = 1; - while (search) { - prev_edge = prev_edge->prev; - if (prev_edge) { - if (prev_edge->bundle[ABOVE][CLIP] || - prev_edge->bundle[ABOVE][SUBJ] || - (prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) { - search = 0; - } - } else { - search = 0; - } - } - } - if (!prev_edge) { - e1->next = aet; - aet = e0->next; - } else { - e1->next = prev_edge->next; - prev_edge->next = e0->next; - } - e0->next->prev = prev_edge; - e1->next->prev = e1; - e0->next = next_edge; - } /* End of IT loop*/ - - /* Prepare for next scanbeam */ - for (edge = aet; edge; edge = next_edge) { - next_edge = edge->next; - succ_edge = edge->succ; - - if ((edge->top.y == yt) && succ_edge) { - /* Replace AET edge by its successor */ - succ_edge->outp[BELOW] = edge->outp[ABOVE]; - succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; - succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; - succ_edge->bundle[BELOW][SUBJ] = 
edge->bundle[ABOVE][SUBJ]; - prev_edge = edge->prev; - if (prev_edge) { - prev_edge->next = succ_edge; - } else { - aet = succ_edge; - } - if (next_edge) { - next_edge->prev = succ_edge; - } - succ_edge->prev = prev_edge; - succ_edge->next = next_edge; - } else { - /* Update this edge */ - edge->outp[BELOW] = edge->outp[ABOVE]; - edge->bstate[BELOW] = edge->bstate[ABOVE]; - edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; - edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; - edge->xb = edge->xt; - } - edge->outp[ABOVE] = NULL; - } - } - } /* === END OF SCANBEAM PROCESSING ================================== */ - - // Generate result tristrip from tlist - result->strip = NULL; - result->num_strips = count_tristrips(tlist); - if (result->num_strips > 0) { - gpc_malloc(result->strip, - result->num_strips * sizeof(gpc_vertex_list), - const_cast("tristrip list creation")); - - s = 0; - for (tn = tlist; tn; tn = tnn) { - tnn = tn->next; - if (tn->active > 2) { - /* Valid tristrip: copy the vertices and free the heap */ - result->strip[s].num_vertices = tn->active; - gpc_malloc(result->strip[s].vertex, - tn->active * sizeof(gpc_vertex), - const_cast("tristrip creation")); - v = 0; - if (0) { - lt = tn->v[RIGHT]; - rt = tn->v[LEFT]; - } else { - lt = tn->v[LEFT]; - rt = tn->v[RIGHT]; - } - while (lt || rt) { - if (lt) { - ltn = lt->next; - result->strip[s].vertex[v].x = lt->x; - result->strip[s].vertex[v].y = lt->y; - v++; - gpc_free(lt); - lt = ltn; - } - if (rt) { - rtn = rt->next; - result->strip[s].vertex[v].x = rt->x; - result->strip[s].vertex[v].y = rt->y; - v++; - gpc_free(rt); - rt = rtn; - } - } - s++; - } else { - /* Invalid tristrip: just free the heap */ - for (lt = tn->v[LEFT]; lt; lt = ltn) { - ltn = lt->next; - gpc_free(lt); - } - for (rt = tn->v[RIGHT]; rt; rt = rtn) { - rtn = rt->next; - gpc_free(rt); - } - } - gpc_free(tn); - } - } - // Tidy up - reset_it(&it); - reset_lmt(&lmt); - gpc_free(c_heap); - gpc_free(s_heap); - gpc_free(sbt); -} // 
NOLINT - -} // namespace gpc - -#endif diff --git a/mobile/src/operators/math/gpc.h b/mobile/src/operators/math/gpc.h deleted file mode 100644 index 2cae7fe184..0000000000 --- a/mobile/src/operators/math/gpc.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MULTICLASSNMS_OP -#pragma once - -#include -#include -#include -#include - -namespace gpc { - -typedef enum { // Set operation type - GPC_DIFF, // Difference - GPC_INT, // Intersection - GPC_XOR, // Exclusive or - GPC_UNION // Union -} gpc_op; - -typedef struct { // Polygon vertex structure - double x; // Vertex x component - double y; // vertex y component -} gpc_vertex; - -typedef struct { // Vertex list structure - int num_vertices; // Number of vertices in list - gpc_vertex *vertex; // Vertex array pointer -} gpc_vertex_list; - -typedef struct { // Polygon set structure - int num_contours; // Number of contours in polygon - int *hole; // Hole external contour flags - gpc_vertex_list *contour; // Contour array pointer -} gpc_polygon; - -typedef struct { // Tristrip set structure - int num_strips; // Number of tristrips - gpc_vertex_list *strip; // Tristrip array pointer -} gpc_tristrip; - -typedef enum { LEFT, RIGHT } gpc_left_right; - -typedef enum { ABOVE, BELOW } gpc_above_below; - -typedef enum { CLIP, SUBJ } gpc_clip_subj; - -typedef enum { /* Edge intersection classes */ - NUL, /* Empty non-intersection */ - EMX, /* 
External maximum */ - ELI, /* External left intermediate */ - TED, /* Top edge */ - ERI, /* External right intermediate */ - RED, /* Right edge */ - IMM, /* Internal maximum and minimum */ - IMN, /* Internal minimum */ - EMN, /* External minimum */ - EMM, /* External maximum and minimum */ - LED, /* Left edge */ - ILI, /* Internal left intermediate */ - BED, /* Bottom edge */ - IRI, /* Internal right intermediate */ - IMX, /* Internal maximum */ - FUL /* Full non-intersection */ -} vertex_type; - -typedef enum { /* Horizontal edge states */ - NH, /* No horizontal edge */ - BH, /* Bottom horizontal edge */ - TH /* Top horizontal edge */ -} h_state; - -typedef enum { /* Edge bundle state */ - UNBUNDLED, /* Isolated edge not within a bundle */ - BUNDLE_HEAD, /* Bundle head node */ - BUNDLE_TAIL /* Passive bundle tail node */ -} bundle_state; - -typedef struct v_shape { /* Internal vertex list datatype */ - double x; /* X coordinate component */ - double y; /* Y coordinate component */ - struct v_shape *next; /* Pointer to next vertex in list */ -} vertex_node; - -typedef struct p_shape { /* Internal contour / tristrip type */ - int active; /* Active flag / vertex count */ - int hole; /* Hole / external contour flag */ - vertex_node *v[2]; /* Left and right vertex list ptrs */ - struct p_shape *next; /* Pointer to next polygon contour */ - struct p_shape *proxy; /* Pointer to actual structure used */ -} polygon_node; - -typedef struct edge_shape { - gpc_vertex vertex; /* Piggy-backed contour vertex data */ - gpc_vertex bot; /* Edge lower (x, y) coordinate */ - gpc_vertex top; /* Edge upper (x, y) coordinate */ - double xb; /* Scanbeam bottom x coordinate */ - double xt; /* Scanbeam top x coordinate */ - double dx; /* Change in x for a unit y increase */ - int type; /* Clip / subject edge flag */ - int bundle[2][2]; /* Bundle edge flags */ - int bside[2]; /* Bundle left / right indicators */ - bundle_state bstate[2]; /* Edge bundle state */ - polygon_node *outp[2]; /* 
Output polygon / tristrip pointer */ - struct edge_shape *prev; /* Previous edge in the AET */ - struct edge_shape *next; /* Next edge in the AET */ - struct edge_shape *pred; /* Edge connected at the lower end */ - struct edge_shape *succ; /* Edge connected at the upper end */ - struct edge_shape *next_bound; /* Pointer to next bound in LMT */ -} edge_node; - -inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); } - -inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); } - -inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); } - -inline int gpc_next_index(int i, int n) { return ((i + 1) % n); } - -inline int gpc_optimal(gpc_vertex *v, int i, int n) { - return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y); -} - -inline int gpc_fwd_min(edge_node *v, int i, int n) { - return (v[(i + 1) % n].vertex.y > v[i].vertex.y && - v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y); -} - -inline int gpc_not_fmax(edge_node *v, int i, int n) { - return (v[(i + 1) % n].vertex.y > v[i].vertex.y); -} - -inline int gpc_rev_min(edge_node *v, int i, int n) { - return (v[(i + 1) % n].vertex.y >= v[i].vertex.y && - v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); -} - -inline int gpc_not_rmax(edge_node *v, int i, int n) { - return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); -} - -// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j) -// { -inline void gpc_p_edge(edge_node *d, edge_node *e, int p) { - d = e; - do { - d = d->prev; - } while (!d->outp[p]); - // i = d->bot.x + d->dx * (j - d->bot.y); -} - -// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j) -// { -inline void gpc_n_edge(edge_node *d, edge_node *e, int p) { - d = e; - do { - d = d->next; - } while (!d->outp[p]); - // i = d->bot.x + d->dx * (j - d->bot.y); -} - -template -void gpc_malloc(T *&p, int b, char *s) { // NOLINT - if (b > 0) { - p = reinterpret_cast(malloc(b)); - - if (!p) { - fprintf(stderr, 
"gpc malloc failure: %s\n", s); - exit(0); - } - } else { - p = NULL; - } -} - -template -void gpc_free(T *&p) { // NOLINT - if (p) { - free(p); - p = NULL; - } -} - -/* -=========================================================================== - Public Function Prototypes -=========================================================================== -*/ - -void add_vertex(vertex_node **t, double x, double y); - -void gpc_vertex_create(edge_node *e, int p, int s, double x, double y); - -void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole); - -void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon, - gpc_polygon *clip_polygon, gpc_polygon *result_polygon); - -void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon, - gpc_polygon *clip_polygon, - gpc_tristrip *result_tristrip); - -void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip); - -void gpc_free_polygon(gpc_polygon *polygon); - -void gpc_free_tristrip(gpc_tristrip *tristrip); - -} // namespace gpc - -#endif diff --git a/mobile/src/operators/math/gru_compute.cpp b/mobile/src/operators/math/gru_compute.cpp deleted file mode 100644 index d30ea5aa47..0000000000 --- a/mobile/src/operators/math/gru_compute.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef GRU_OP - -#include "operators/math/gru_compute.h" -#include "common/types.h" -#include "operators/math/activation.h" -#include "operators/math/gemm/cblas.h" -#include "operators/math/gru_cpu_kernel.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -struct GRUUnitFunctor { - static void compute(GRUMetaValue value, int frame_size, int batch_size, - const ActivationType active_node, - const ActivationType active_gate) { - if (value.prev_out_value) { - cblas_sgemm(false, false, batch_size, frame_size * 2, frame_size, 1.f, - value.prev_out_value, frame_size, value.gate_weight, - frame_size * 2, 1.f, value.gate_value, frame_size * 3); - } - - forward_reset_output(value, frame_size, batch_size, active_gate); - - if (value.prev_out_value) { - cblas_sgemm(false, false, batch_size, frame_size, frame_size, 1.f, - value.reset_output_value, frame_size, value.state_weight, - frame_size, 1.f, value.gate_value + frame_size * 2, - frame_size * 3); - } - - forward_final_output(value, frame_size, batch_size, active_node); - } -}; - -template struct GRUUnitFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/math/gru_compute.h b/mobile/src/operators/math/gru_compute.h deleted file mode 100644 index 00f4da9022..0000000000 --- a/mobile/src/operators/math/gru_compute.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef GRU_OP -#pragma once - -#include "operators/math/activation.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -struct GRUMetaValue { - T *gate_weight; - T *state_weight; - T *gate_value; - T *reset_output_value; - T *output_value; - T *prev_out_value; -}; - -template -struct GRUUnitFunctor { - static void compute(GRUMetaValue value, int frame_size, int batch_size, - const ActivationType active_node, - const ActivationType active_gate); -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/mobile/src/operators/math/gru_cpu_kernel.h b/mobile/src/operators/math/gru_cpu_kernel.h deleted file mode 100644 index a010fb616b..0000000000 --- a/mobile/src/operators/math/gru_cpu_kernel.h +++ /dev/null @@ -1,203 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef GRU_OP - -#pragma once - -#include -#include "operators/math/activation.h" -#include "operators/math/gru_compute.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -void hl_naive_gru_forward_reset_output(T *gate_value, T *reset_output_value, - T *prev_output_value, int frame_size) { - T r_value_update_gate; - T r_value_reset_gate; - T r_value_reset_output; - T r_prev_out = 0; - T *update_gate = gate_value; - T *reset_gate = gate_value + frame_size; - - int remain = frame_size; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - int loop = remain >> 3; - remain = remain & 0x7; - float32x4_t prev0 = vdupq_n_f32(0.f); - float32x4_t prev1 = vdupq_n_f32(0.f); - for (int i = 0; i < loop; ++i) { - float32x4_t update0 = vld1q_f32(update_gate); - float32x4_t update1 = vld1q_f32(update_gate + 4); - float32x4_t reset0 = vld1q_f32(reset_gate); - float32x4_t reset1 = vld1q_f32(reset_gate + 4); - if (prev_output_value) { - prev0 = vld1q_f32(prev_output_value); - prev1 = vld1q_f32(prev_output_value + 4); - prev_output_value += 8; - } - update0 = vActiveq_f32(update0); - update1 = vActiveq_f32(update1); - reset0 = vActiveq_f32(reset0); - reset1 = vActiveq_f32(reset1); - float32x4_t output0 = vmulq_f32(prev0, reset0); - float32x4_t output1 = vmulq_f32(prev1, reset1); - vst1q_f32(update_gate, update0); - vst1q_f32(update_gate + 4, update1); - vst1q_f32(reset_gate, reset0); - vst1q_f32(reset_gate + 4, reset1); - vst1q_f32(reset_output_value, output0); - vst1q_f32(reset_output_value + 4, output1); - update_gate += 8; - reset_gate += 8; - reset_output_value += 8; - } -#endif // __ARM_NEON__ - for (int i = 0; i < remain; i++) { - r_value_update_gate = update_gate[i]; - r_value_reset_gate = reset_gate[i]; - if (prev_output_value) { - r_prev_out = prev_output_value[i]; - } - r_value_update_gate = Active(r_value_update_gate); - r_value_reset_gate = Active(r_value_reset_gate); - r_value_reset_output = r_prev_out * r_value_reset_gate; - 
update_gate[i] = r_value_update_gate; - reset_gate[i] = r_value_reset_gate; - reset_output_value[i] = r_value_reset_output; - } -} - -template -void hl_naive_gru_forward_final_output(T *gate_value, T *prev_output_value, - T *output_value, int frame_size) { - T r_value_update_gate; - T r_value_frame_state; - T r_prev_out = 0; - T r_output; - T *update_gate = gate_value; - T *frame_state = gate_value + frame_size * 2; - - int remain = frame_size; -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - int loop = remain >> 3; - remain = remain & 0x7; - float32x4_t prev0 = vdupq_n_f32(0.f); - float32x4_t prev1 = vdupq_n_f32(0.f); - for (int i = 0; i < loop; ++i) { - float32x4_t update0 = vld1q_f32(update_gate); - float32x4_t update1 = vld1q_f32(update_gate + 4); - float32x4_t state0 = vld1q_f32(frame_state); - float32x4_t state1 = vld1q_f32(frame_state + 4); - if (prev_output_value) { - prev0 = vld1q_f32(prev_output_value); - prev1 = vld1q_f32(prev_output_value + 4); - prev_output_value += 8; - } - state0 = vActiveq_f32(state0); - state1 = vActiveq_f32(state1); - float32x4_t output0 = vmlsq_f32(prev0, update0, prev0); - float32x4_t output1 = vmlsq_f32(prev1, update1, prev1); - output0 = vmlaq_f32(output0, update0, state0); - output1 = vmlaq_f32(output1, update1, state1); - vst1q_f32(frame_state, state0); - vst1q_f32(frame_state + 4, state1); - vst1q_f32(output_value, output0); - vst1q_f32(output_value + 4, output1); - update_gate += 8; - frame_state += 8; - output_value += 8; - } -#endif // __ARM_NEON__ - for (int i = 0; i < remain; i++) { - r_value_update_gate = update_gate[i]; - r_value_frame_state = frame_state[i]; - if (prev_output_value) { - r_prev_out = prev_output_value[i]; - } - r_value_frame_state = Active(r_value_frame_state); - r_output = r_prev_out - r_value_update_gate * r_prev_out + - r_value_update_gate * r_value_frame_state; - frame_state[i] = r_value_frame_state; - output_value[i] = r_output; - } -} - -#define FORWARD_RESET_OUTPUT(active_type, value, 
frame_size) \ - hl_naive_gru_forward_reset_output( \ - value.gate_value, value.reset_output_value, value.prev_out_value, \ - frame_size); - -template -inline void forward_reset_output(GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_node) { - for (int b = 0; b < batch_size; ++b) { - switch (active_node) { - case RELU: - FORWARD_RESET_OUTPUT(RELU, value, frame_size); - break; - case SIGMOID: - FORWARD_RESET_OUTPUT(SIGMOID, value, frame_size); - break; - case TANH: - FORWARD_RESET_OUTPUT(TANH, value, frame_size); - break; - default: - FORWARD_RESET_OUTPUT(IDENTITY, value, frame_size); - } - value.gate_value += frame_size * 3; - value.reset_output_value += frame_size; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - } -} - -#define FORWARD_FINAL_OUTPUT(active_type, value, frame_size) \ - hl_naive_gru_forward_final_output( \ - value.gate_value, value.prev_out_value, value.output_value, frame_size) - -template -inline void forward_final_output(GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_node) { - for (int b = 0; b < batch_size; ++b) { - switch (active_node) { - case RELU: - FORWARD_FINAL_OUTPUT(RELU, value, frame_size); - break; - case SIGMOID: - FORWARD_FINAL_OUTPUT(SIGMOID, value, frame_size); - break; - case TANH: - FORWARD_FINAL_OUTPUT(TANH, value, frame_size); - break; - default: - FORWARD_FINAL_OUTPUT(IDENTITY, value, frame_size); - } - value.gate_value += frame_size * 3; - value.output_value += frame_size; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/im2col.cpp b/mobile/src/operators/math/im2col.cpp deleted file mode 100644 index a7b97e5bfc..0000000000 --- a/mobile/src/operators/math/im2col.cpp +++ /dev/null @@ -1,668 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#ifdef __ARM_NEON -#include -#endif -#include -#include "common/types.h" -#include "operators/math/im2col.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template <> -void ExtractToImg(const float *im_data, float *col_data, - const int im_height, const int im_width, - const int col_height, const int col_width, - const int padding_h, const int padding_w, - const int stride_h, const int stride_w, const int kh, - const int kw) { - int h = padding_h - kh; - int w = padding_w - kw; - int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0; - int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0; - int start_height = kh + col_start_height * stride_h - padding_h; - int start_width = kw + col_start_width * stride_w - padding_w; - - int end_height = (col_height - col_start_height) * stride_h + start_height; - end_height = end_height > im_height ? im_height : end_height; - int end_width = (col_width - col_start_width) * stride_w + start_width; - end_width = end_width > im_width ? 
im_width : end_width; - int extract = (end_width - start_width + stride_w - 1) / stride_w; - - im_data += start_height * im_width + start_width; - col_data += col_start_height * col_width + col_start_width; - for (int i = start_height; i < end_height; i += stride_h) { - int s = 0; - if (stride_w == 1) { -#if __ARM_NEON - for (; s < extract - 3; s += 4) { - float32x4_t _img = vld1q_f32(im_data + s); - vst1q_f32(col_data + s, _img); - } -#endif - for (; s < extract; ++s) { - col_data[s] = im_data[s]; - } - } else if (stride_w == 2) { -#if __ARM_NEON - for (; s < extract - 3; s += 4) { - float32x4x2_t _img = vld2q_f32(im_data + s * 2); - vst1q_f32(col_data + s, _img.val[0]); - } -#endif - for (; s < extract; ++s) { - col_data[s] = im_data[s * 2]; - } - } else if (stride_w == 3) { -#if __ARM_NEON - for (; s < extract - 3; s += 4) { - float32x4x3_t _img = vld3q_f32(im_data + s * 3); - vst1q_f32(col_data + s, _img.val[0]); - } -#endif - for (; s < extract; ++s) { - col_data[s] = im_data[s * 3]; - } - } else if (stride_w == 4) { -#if __ARM_NEON - for (; s < extract - 3; s += 4) { - float32x4x4_t _img = vld4q_f32(im_data + s * 4); - vst1q_f32(col_data + s, _img.val[0]); - } -#endif - for (; s < extract; ++s) { - col_data[s] = im_data[s * 4]; - } - } else { - PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4."); - } - im_data += im_width * stride_h; - col_data += col_width; - } -} - -template <> -void ExtractToImg(const int8_t *im_data, int8_t *col_data, - const int im_height, const int im_width, - const int col_height, const int col_width, - const int padding_h, const int padding_w, - const int stride_h, const int stride_w, const int kh, - const int kw) { - int h = padding_h - kh; - int w = padding_w - kw; - int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0; - int col_start_width = w > 0 ? 
(w + stride_w - 1) / stride_w : 0; - int start_height = kh + col_start_height * stride_h - padding_h; - int start_width = kw + col_start_width * stride_w - padding_w; - - int end_height = (col_height - col_start_height) * stride_h + start_height; - end_height = end_height > im_height ? im_height : end_height; - int end_width = (col_width - col_start_width) * stride_w + start_width; - end_width = end_width > im_width ? im_width : end_width; - int extract = (end_width - start_width + stride_w - 1) / stride_w; - - im_data += start_height * im_width + start_width; - col_data += col_start_height * col_width + col_start_width; - for (int i = start_height; i < end_height; i += stride_h) { - int s = 0; - if (stride_w == 1) { - for (; s < extract - 15; s += 16) { - int8x16_t _img = vld1q_s8(im_data + s); - vst1q_s8(col_data + s, _img); - } - for (; s < extract; ++s) { - col_data[s] = im_data[s]; - } - } else if (stride_w == 2) { -#if __ARM_NEON - for (; s < extract - 15; s += 16) { - int8x16x2_t _img = vld2q_s8(im_data + s * 2); - vst1q_s8(col_data + s, _img.val[0]); - } -#endif - for (; s < extract; ++s) { - col_data[s] = im_data[s * 2]; - } - } else if (stride_w == 3) { -#if __ARM_NEON - for (; s < extract - 15; s += 16) { - int8x16x3_t img = vld3q_s8(im_data + s * 3); - vst1q_s8(col_data + s, img.val[0]); - } -#endif - for (; s < extract; ++s) { - col_data[s] = im_data[s * 3]; - } - } else if (stride_w == 4) { -#if __ARM_NEON - for (; s < extract - 15; s += 16) { - int8x16x4_t img = vld4q_s8(im_data + s * 4); - vst1q_s8(col_data + s, img.val[0]); - } -#endif - for (; s < extract; ++s) { - col_data[s] = im_data[s * 4]; - } - } else { - PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4."); - } - im_data += im_width * stride_h; - col_data += col_width; - } -} - -/* - * im = [input_channels, input_height, input_width] - * col = - * [input_channels, filter_height, filter_width, output_height, - * output_width] - */ -template -class Im2ColFunctor { - public: 
- void operator()(const framework::Tensor &im, const std::vector &dilation, - const std::vector &stride, - const std::vector &padding, framework::Tensor *col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int col_height = col->dims()[3]; - int col_width = col->dims()[4]; - - int channels_col = im_channels * filter_height * filter_width; - const T *im_data = im.data(); - T *col_data = col->data(); -#if __ARM_NEON - if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) { - int im_spatial_size = im_height * im_width; - int col_spatial_size = col_height * col_width; - // pad 0 - memset(col_data, 0, col->numel() * sizeof(T)); - - #pragma omp parallel for - for (int ic = 0; ic < im_channels; ++ic) { - const T *local_im_data = im_data + ic * im_spatial_size; - T *local_col_data = - col_data + ic * filter_height * filter_width * col_spatial_size; - for (int kh = 0; kh < filter_height; ++kh) { - for (int kw = 0; kw < filter_width; ++kw) { - ExtractToImg(local_im_data, local_col_data, im_height, im_width, - col_height, col_width, padding[0], padding[1], - stride[0], stride[1], kh, kw); - local_col_data += col_spatial_size; - } - } - } - } else { -#endif - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { - int im_col_idx = - w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * col_height + h) * col_width + w; - int im_idx = - (im_row_idx + c_im * im_height) * im_width + im_col_idx; - - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 
static_cast(0) - : im_data[im_idx]; - } - } - } -#if __ARM_NEON - } -#endif - } -}; - -template <> -void ExtendToImg(const float *col_data, float *im_data, - const int im_height, const int im_width, - const int col_height, const int col_width, - const int padding_h, const int padding_w, - const int stride_h, const int stride_w, const int kh, - const int kw) { - int h = padding_h - kh; - int w = padding_w - kw; - int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0; - int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0; - int start_height = kh + col_start_height * stride_h - padding_h; - int start_width = kw + col_start_width * stride_w - padding_w; - - int end_height = (col_height - col_start_height) * stride_h + start_height; - end_height = end_height > im_height ? im_height : end_height; - int end_width = (col_width - col_start_width) * stride_w + start_width; - end_width = end_width > im_width ? im_width : end_width; - // int extract = (end_width - start_width + stride_w - 1) / stride_w; - int extend = end_width - start_width; - - im_data += start_height * im_width + start_width; - col_data += col_start_height * col_width + col_start_width; - - for (int i = start_height; i < end_height; i += stride_h) { - int s = 0; - if (stride_w == 1) { -#if __ARM_NEON - for (; s < extend - 3; s += 4) { - float32x4_t _col = vld1q_f32(col_data + s); - float32x4_t _img = vld1q_f32(im_data + s); - _img = vaddq_f32(_img, _col); - vst1q_f32(im_data + s, _img); - } -#endif - for (; s < extend; ++s) { - im_data[s] += col_data[s]; - } - } else if (stride_w == 2) { -#if __ARM_NEON - for (; s < extend - 7; s += 8) { - float32x4_t _col = vld1q_f32(col_data + s / 2); - float32x4x2_t _img = vld2q_f32(im_data + s); - _img.val[0] = vaddq_f32(_img.val[0], _col); - vst2q_f32(im_data + s, _img); - } -#endif - for (; s < extend; s += 2) { - im_data[s] += col_data[s / 2]; - } - } else { - PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1 and 2."); - } - im_data += 
im_width * stride_h; - col_data += col_width; - } -} - -template <> -void ExtendToImgV2(const float *col_data, float *im_data, - const int im_height, const int im_width, - const int col_height, const int col_width, - const int padding_h, const int padding_w, - const int stride_h, const int stride_w, const int kh, - const int kernel_w) { - int col_spatial_size = col_height * col_width; - int h = padding_h - kh; - int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0; - int start_height = kh + col_start_height * stride_h - padding_h; - int end_height = (col_height - col_start_height) * stride_h + start_height; - end_height = end_height > im_height ? im_height : end_height; - im_data += start_height * im_width; - col_data += col_start_height * col_width; - - int kw = 0; - for (; kw < kernel_w - 1; kw += 2) { - int w0 = padding_w - kw; - int w1 = padding_w - (kw + 1); - int col_start_width0 = w0 > 0 ? (w0 + stride_w - 1) / stride_w : 0; - int col_start_width1 = w1 > 0 ? (w1 + stride_w - 1) / stride_w : 0; - int start_width0 = kw + col_start_width0 * stride_w - padding_w; - int start_width1 = (kw + 1) + col_start_width1 * stride_w - padding_w; - - int end_width0 = (col_width - col_start_width0) * stride_w + start_width0; - end_width0 = end_width0 > im_width ? im_width : end_width0; - int end_width1 = (col_width - col_start_width1) * stride_w + start_width1; - end_width1 = end_width1 > im_width ? 
im_width : end_width1; - int start_width = 0; - int end_width = 0; - if (stride_w == 1) { - start_width = std::max(start_width0, start_width1); - end_width = std::min(end_width0, end_width1); - } else if (stride_w == 2) { - start_width = std::min(start_width0, start_width1); - end_width = std::min(end_width0, end_width1); - } else { - PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1 and 2."); - } - - // DLOG << "start_width0: " << start_width0 << ", end_width0: " << - // end_width0; DLOG << "start_width1: " << start_width1 << ", end_width1: - // " << end_width1; - int extend = end_width - start_width; - float *im_data01 = im_data + start_width; - float *im_data0 = im_data + start_width0; - float *im_data1 = im_data + start_width1; - const float *col_data0 = col_data + col_start_width0; - const float *col_data1 = col_data + col_spatial_size + col_start_width1; - - for (int i = start_height; i < end_height; i += stride_h) { - int s = 0; - if (stride_w == 1) { - int offset0 = start_width - start_width0; - int offset1 = start_width - start_width1; - for (int ss = 0; ss < start_width - start_width0; ++ss) { - im_data0[ss] += col_data0[ss]; - } - for (int ss = 0; ss < start_width - start_width1; ++ss) { - im_data1[ss] += col_data1[ss]; - } -#if __ARM_NEON - for (; s < extend - 3; s += 4) { - float32x4_t _col0 = vld1q_f32(col_data0 + offset0 + s); - float32x4_t _col1 = vld1q_f32(col_data1 + offset1 + s); - float32x4_t _img = vld1q_f32(im_data01 + s); - _img = vaddq_f32(_img, _col0); - _img = vaddq_f32(_img, _col1); - vst1q_f32(im_data01 + s, _img); - } -#endif - for (int ss = s; ss < end_width0 - start_width0; ++ss) { - im_data0[ss] += col_data0[ss]; - } - for (int ss = s; ss < end_width1 - start_width1; ++ss) { - im_data1[ss] += col_data1[ss]; - } - } else if (stride_w == 2) { - if (start_width0 < start_width1) { -#if __ARM_NEON - for (; s < extend - 7; s += 8) { - float32x4_t _col0 = vld1q_f32(col_data0 + s / 2); - float32x4_t _col1 = vld1q_f32(col_data1 + s / 
2); - float32x4x2_t _img = vld2q_f32(im_data01 + s); - _img.val[0] = vaddq_f32(_img.val[0], _col0); - _img.val[1] = vaddq_f32(_img.val[1], _col1); - vst2q_f32(im_data01 + s, _img); - } -#endif - } else { -#if __ARM_NEON - for (; s < extend - 7; s += 8) { - float32x4_t _col0 = vld1q_f32(col_data0 + s / 2); - float32x4_t _col1 = vld1q_f32(col_data1 + s / 2); - float32x4x2_t _img = vld2q_f32(im_data01 + s); - _img.val[0] = vaddq_f32(_img.val[0], _col1); - _img.val[1] = vaddq_f32(_img.val[1], _col0); - vst2q_f32(im_data01 + s, _img); - } -#endif - } - for (int ss = s; ss < end_width0 - start_width0; ss += 2) { - im_data0[ss] += col_data0[ss / 2]; - } - for (int ss = s; ss < end_width1 - start_width1; ss += 2) { - im_data1[ss] += col_data1[ss / 2]; - } - } - - im_data0 += im_width * stride_h; - im_data1 += im_width * stride_h; - im_data01 += im_width * stride_h; - col_data0 += col_width; - col_data1 += col_width; - } - col_data += 2 * col_spatial_size; - } - - for (; kw < kernel_w; ++kw) { - int w = padding_w - kw; - int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0; - int start_width = kw + col_start_width * stride_w - padding_w; - - int end_width = (col_width - col_start_width) * stride_w + start_width; - end_width = end_width > im_width ? 
im_width : end_width; - int extend = end_width - start_width; - - float *im_data0 = im_data + start_width; - const float *col_data0 = col_data + col_start_width; - - for (int i = start_height; i < end_height; i += stride_h) { - int s = 0; - if (stride_w == 1) { -#if __ARM_NEON - for (; s < extend - 3; s += 4) { - float32x4_t _col = vld1q_f32(col_data + s); - float32x4_t _img = vld1q_f32(im_data + s); - _img = vaddq_f32(_img, _col); - vst1q_f32(im_data + s, _img); - } -#endif - for (; s < extend; ++s) { - im_data[s] += col_data[s]; - } - } else if (stride_w == 2) { -#if __ARM_NEON - for (; s < extend - 7; s += 8) { - float32x4_t _col = vld1q_f32(col_data + s / 2); - float32x4x2_t _img = vld2q_f32(im_data + s); - _img.val[0] = vaddq_f32(_img.val[0], _col); - vst2q_f32(im_data + s, _img); - } -#endif - for (; s < extend; s += 2) { - im_data[s] += col_data[s / 2]; - } - } else { - PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1 and 2."); - } - im_data += im_width * stride_h; - col_data += col_width; - } - col_data += col_spatial_size; - } -} - -/* - * im = [input_channels, input_height, input_width] - * col = - * [input_channels, filter_height, filter_width, output_height, - * output_width] - */ -template -class Col2ImFunctor { - public: - void operator()(const framework::Tensor &col, - const std::vector &dilation, - const std::vector &stride, - const std::vector &padding, framework::Tensor *im) { - int im_channels = im->dims()[0]; - int im_height = im->dims()[1]; - int im_width = im->dims()[2]; - int filter_height = col.dims()[1]; - int filter_width = col.dims()[2]; - int col_height = col.dims()[3]; - int col_width = col.dims()[4]; - - int channels_col = im_channels * filter_height * filter_width; - const T *col_data = col.data(); - T *im_data = im->data(); - memset(static_cast(im_data), 0, sizeof(T) * im->numel()); - -#if __ARM_NEON - if (stride[0] <= 2 && dilation[0] == 1 && dilation[0] == dilation[1]) { - int im_spatial_size = im_height * im_width; - int 
col_spatial_size = col_height * col_width; - - #pragma omp parallel for - for (int ic = 0; ic < im_channels; ++ic) { - T *local_im_data = im_data + ic * im_spatial_size; - const T *local_col_data = - col_data + ic * filter_height * filter_width * col_spatial_size; - for (int kh = 0; kh < filter_height; ++kh) { -#if 0 - for (int kw = 0; kw < filter_width; ++kw) { - ExtendToImg(local_col_data, local_im_data, im_height, im_width, - col_height, col_width, padding[0], padding[1], - stride[0], stride[1], kh, kw); - local_col_data += col_spatial_size; - } -#else - ExtendToImgV2(local_col_data, local_im_data, im_height, im_width, - col_height, col_width, padding[0], padding[1], - stride[0], stride[1], kh, filter_width); - local_col_data += col_spatial_size * filter_width; -#endif - } - } - } else { -#endif - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { - int im_col_idx = - w * stride[1] - padding[1] + w_offset * dilation[1]; - if ((im_row_idx) >= 0 && (im_row_idx) < im_height && - (im_col_idx) >= 0 && (im_col_idx) < im_width) { - im_data[(im_row_idx + c_im * im_height) * im_width + - im_col_idx] += - col_data[(c * col_height + h) * col_width + w]; - } - } - } - } -#if __ARM_NEON - } -#endif - } -}; - -template class Im2ColFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -// template class Col2ImFunctor; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [output_height, output_width, input_channels, filter_height, - * filter_width] - */ -template -class Im2ColFunctor { - public: - void operator()(const framework::Tensor &im, const std::vector &dilation, - const std::vector &stride, - const std::vector &padding, framework::Tensor *col) { - int 
im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[3]; - int filter_width = col->dims()[4]; - int col_height = col->dims()[0]; - int col_width = col->dims()[1]; - - const T *im_data = im.data(); - T *col_data = col->data(); - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { - for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { - for (int channel = 0; channel < im_channels; ++channel) { - for (int filter_row_idx = 0; filter_row_idx < filter_height; - ++filter_row_idx) { - int im_row_offset = - col_row_idx * stride[0] + filter_row_idx - padding[0]; - for (int filter_col_idx = 0; filter_col_idx < filter_width; - ++filter_col_idx) { - int im_col_offset = - col_col_idx * stride[1] + filter_col_idx - padding[1]; - int col_offset = - ((((col_row_idx)*col_width + col_col_idx) * im_channels + - channel) * - filter_height + - filter_row_idx) * - filter_width + - filter_col_idx; - int im_offset = (channel * im_height + im_row_offset) * im_width + - im_col_offset; - col_data[col_offset] = - (im_row_offset < 0 || im_row_offset >= im_height || - im_col_offset < 0 || im_col_offset >= im_width) - ? 
static_cast(0) - : im_data[im_offset]; - } - } - } - } - } - } -}; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [output_height, output_width, input_channels, filter_height, - * filter_width] - */ -template -class Col2ImFunctor { - public: - void operator()(const framework::Tensor &col, - const std::vector &dilation, - const std::vector &stride, - const std::vector &padding, framework::Tensor *im) { - int im_channels = im->dims()[0]; - int im_height = im->dims()[1]; - int im_width = im->dims()[2]; - int filter_height = col.dims()[3]; - int filter_width = col.dims()[4]; - int col_height = col.dims()[0]; - int col_width = col.dims()[1]; - - T *im_data = im->data(); - const T *col_data = col.data(); - - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { - for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { - for (int channel = 0; channel < im_channels; ++channel) { - for (int filter_row_idx = 0; filter_row_idx < filter_height; - ++filter_row_idx) { - int im_row_offset = - col_row_idx * stride[0] + filter_row_idx - padding[0]; - for (int filter_col_idx = 0; filter_col_idx < filter_width; - ++filter_col_idx) { - int im_col_offset = - col_col_idx * stride[1] + filter_col_idx - padding[1]; - - int col_offset = - (((col_row_idx * col_width + col_col_idx) * im_channels + - channel) * - filter_height + - filter_row_idx) * - filter_width + - filter_col_idx; - - if (im_row_offset >= 0 && im_row_offset < im_height && - im_col_offset >= 0 && im_col_offset < im_width) { - int im_offset = - (channel * im_height + im_row_offset) * im_width + - im_col_offset; - im_data[im_offset] += col_data[col_offset]; - } - } - } - } - } - } - } -}; - -template class Im2ColFunctor; -template class Col2ImFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/im2col.h b/mobile/src/operators/math/im2col.h deleted file mode 100644 index 347f72c917..0000000000 --- 
a/mobile/src/operators/math/im2col.h +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -/* The storage format of the coldata in the Im2ColFunctor and - * Col2ImFunctor. */ -enum class ColFormat { kCFO = 0, kOCF = 1 }; - -template -void ExtractToImg(const T *im_data, T *col_data, const int im_height, - const int im_width, const int col_height, const int col_width, - const int padding_h, const int padding_w, const int stride_h, - const int stride_w, const int kh, const int kw); - -template -void ExtendToImg(const T *col_data, T *im_data, const int im_height, - const int im_width, const int col_height, const int col_width, - const int padding_h, const int padding_w, const int stride_h, - const int stride_w, const int kh, const int kw); - -template -void ExtendToImgV2(const T *col_data, T *im_data, const int im_height, - const int im_width, const int col_height, - const int col_width, const int padding_h, - const int padding_w, const int stride_h, const int stride_w, - const int kh, const int kernel_w); - -/* - * \brief Converts the image data of three dimensions(CHW) into a - * colData of - * five dimensions in the Im2ColFunctor calculation, - * And in the Col2ImFunctor calculation, it is reversed. - * - * \param imData Image data. 
- * \param imShape The shape of imData, - * [input_channels, input_height, input_width]. - * \param colData Column data. - * \param colShape The shape of colData. - * - * \param dilations dilation data. - * \param 2-dimension [dilation_height, dilation_width]. - * - * \param strides stride data. - * \param 2-dimension [stride_height, stride_width]. - * - * \param paddings padding data. - * \param 4-dimension [up_pad, left_pad, down_pad, right_pad]. - * - * If the template argument Format is kCFO, the shape of colData is: - * [input_channels, filter_height, filter_width, output_height, - * output_width] - * So, it is easy to reshape into a convolution matrix for - * convolution - * calculation based on matrix multiplication. - * The shape of convolution matrix is [height, width], where the - * height is equal - * input_channels * filter_height * filter_width, and the width is - * equal - * output_height * output_width. - * - * Reshape: - * shape of colData shape of convolution matrix - * [input_channels, - * filter_height, - * filter_width, ======> [height, width] - * output_height, - * output_width] - * - * If the template argument Format is kOCF, the shape of colData is: - * [output_height, output_width, input_channels, filter_height, - * filter_width] - * So, it is easy to reshape into a sequence matrix for rnn - * calculation. - * The shape of sequence matrix is [seq_length, step_size], where - * the seq_length - * is equal output_height * output_width, and the step_size is equal - * input_channels * filter_height * filter_width. - * - * Reshape: - * shape of colData shape of sequence matrix - * [output_height, - * output_width, - * input_channels, ======> [seqLength, stepSize] - * filter_height, - * filter_width] - * - * \note The caller needs to ensure that imShape.inputChannels is - * equal to - * colShape.inputChannels. 
- */ -template -class Im2ColFunctor { - public: - void operator()(const framework::Tensor &im, const std::vector &dilation, - const std::vector &stride, - const std::vector &padding, framework::Tensor *col); -}; - -template -class Col2ImFunctor { - public: - void operator()(const framework::Tensor &col, - const std::vector &dilation, - const std::vector &stride, - const std::vector &padding, framework::Tensor *im); -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/math.h b/mobile/src/operators/math/math.h deleted file mode 100644 index 8ff5019e31..0000000000 --- a/mobile/src/operators/math/math.h +++ /dev/null @@ -1,342 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* NEON implementation of sin, cos, exp and log - * - * Inspired by Intel Approximate Math library, and based on the - * corresponding algorithms of the cephes math library - */ - -/* Copyright (C) 2011 Julien Pommier - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. 
The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - * - * (this is the zlib license) - */ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -#pragma once - -#include - -#define c_inv_mant_mask ~0x7f800000u -#define c_cephes_SQRTHF 0.707106781186547524 -#define c_cephes_log_p0 7.0376836292E-2 -#define c_cephes_log_p1 -1.1514610310E-1 -#define c_cephes_log_p2 1.1676998740E-1 -#define c_cephes_log_p3 -1.2420140846E-1 -#define c_cephes_log_p4 +1.4249322787E-1 -#define c_cephes_log_p5 -1.6668057665E-1 -#define c_cephes_log_p6 +2.0000714765E-1 -#define c_cephes_log_p7 -2.4999993993E-1 -#define c_cephes_log_p8 +3.3333331174E-1 -#define c_cephes_log_q1 -2.12194440e-4 -#define c_cephes_log_q2 0.693359375 - -/* natural logarithm computed for 4 simultaneous float - * return NaN for x <= 0 - */ -static inline float32x4_t log_ps(float32x4_t x) { - float32x4_t one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ - uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); - - int32x4_t ux = vreinterpretq_s32_f32(x); - - int32x4_t emm0 = vshrq_n_s32(ux, 23); - - /* keep only the fractional part */ - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - float32x4_t e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - /* part2: - * if( x < SQRTHF ) { - * e -= 1; - * x = x + x - 1.0; - * } else { x = x - 1.0; } - */ - uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - 
float32x4_t tmp = - vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32( - e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - float32x4_t z = vmulq_f32(x, x); - - float32x4_t y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN - return x; -} - -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f - -#define c_cephes_LOG2EF 1.44269504088896341 -#define c_cephes_exp_C1 0.693359375 -#define c_cephes_exp_C2 -2.12194440e-4 - -#define c_cephes_exp_p0 1.9875691500E-4 -#define c_cephes_exp_p1 1.3981999507E-3 -#define c_cephes_exp_p2 8.3334519073E-3 -#define c_cephes_exp_p3 4.1665795894E-2 -#define c_cephes_exp_p4 1.6666665459E-1 -#define c_cephes_exp_p5 5.0000001201E-1 - -/* exp() computed for 4 float at once */ -static inline float32x4_t exp_ps(float32x4_t x) { - float32x4_t tmp, fx; - - float32x4_t one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, 
vdupq_n_f32(c_exp_lo)); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - uint32x4_t mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, - c_cephes_exp_p2, c_cephes_exp_p3, - c_cephes_exp_p4, c_cephes_exp_p5}; - float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0); - float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1); - float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2); - float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3); - float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4); - float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - float32x4_t pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -#define c_minus_cephes_DP1 -0.78515625 -#define c_minus_cephes_DP2 -2.4187564849853515625e-4 -#define c_minus_cephes_DP3 -3.77489497744594108e-8 -#define c_sincof_p0 -1.9515295891E-4 -#define c_sincof_p1 8.3321608736E-3 -#define c_sincof_p2 -1.6666654611E-1 -#define c_coscof_p0 2.443315711809948E-005 -#define c_coscof_p1 -1.388731625493765E-003 -#define c_coscof_p2 4.166664568298827E-002 -#define c_cephes_FOPI 
1.27323954473516 // 4 / M_PI - -/* evaluation of 4 sines & cosines at once. - * - * The code is the exact rewriting of the cephes sinf function. - * Precision is excellent as long as x < 8192 (I did not bother to - * take into account the special handling they have for greater values - * -- it does not return garbage for arguments over 8192, though, but - * the extra precision is missing). - * - * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - * surprising but correct result. - * - * Note also that when you compute sin(x), cos(x) is available at - * almost no extra price so both sin_ps and cos_ps make use of - * sincos_ps.. - */ -static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, - float32x4_t *ycos) { - // any x - float32x4_t xmm1, xmm2, xmm3, y; - - uint32x4_t emm2; - - uint32x4_t sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - /* scale by 4/Pi */ - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - /* store the integer part of y in mm0 */ - emm2 = vcvtq_u32_f32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - /* get the polynom selection mask - * there is one polynom for 0 <= x <= Pi/4 - * and another one for Pi/4 -#include "common/enforce.h" -#include "framework/data_type.h" -#include "framework/tensor.h" -#include "operators/math/gemm.h" -#include "operators/math/gemm/cblas.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -struct TensorSetConstant { - TensorSetConstant(framework::Tensor *tensor, float value) - : tensor_(tensor), value_(value) {} - template - void apply() const { - auto *begin = tensor_->mutable_data(); - std::fill(begin, begin + tensor_->numel(), static_cast(value_)); - } - framework::Tensor *tensor_; - float value_; -}; - -void SetConstant(framework::Tensor *tensor, float value) { - 
framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstant(tensor, value)); -} - -template <> -void MatMul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - float alpha, framework::Tensor *matrix_out, - float beta, bool relu, float *bias) { - auto dim_a = matrix_a.dims(); - auto dim_b = matrix_b.dims(); - auto dim_out = matrix_out->dims(); - PADDLE_MOBILE_ENFORCE( - dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of MatMul be matrix"); - - int M = dim_out[0]; - int N = dim_out[1]; - int K = (!trans_a) ? dim_a[1] : dim_a[0]; - int ldb = (!trans_b) ? dim_b[1] : dim_b[0]; - - Gemm gemm; - if (trans_a) { - framework::Tensor matrix_trans; - int numel = matrix_a.numel(); - int m = matrix_a.dims()[0]; - int n = matrix_a.dims()[1]; - float *tmp = (float *)(matrix_a.data()); // NOLINT - float *a = matrix_trans.mutable_data(matrix_a.dims()); - int index = 0; - for (int j = 0; j < n; j++) { - for (int i = 0; i < m; i++) { - a[index++] = tmp[i * n + j]; - } - } - cblas_sgemm(false, trans_b, M, N, K, alpha, a, K, matrix_b.data(), - ldb, beta, matrix_out->data(), N); - } else { - cblas_sgemm(false, trans_b, M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), ldb, beta, matrix_out->data(), - N); - } -} - -void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu, - framework::Tensor *new_scale, framework::Tensor *new_bias, - int group, float *bias) { - Gemm gemm; - auto dim_a = matrix_a.dims(); - auto dim_b = matrix_b.dims(); - auto dim_out = matrix_out->dims(); - PADDLE_MOBILE_ENFORCE( - dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of MatMul be matrix"); - - int M = dim_out[0]; - int N = dim_out[1]; - int K = (!trans_a) ? 
dim_a[1] : dim_a[0]; - -#ifdef _OPENMP - gemm.SgemmWithBn_omp( - M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N, relu, - new_scale->data() + group, new_bias->data() + group, bias); -#else - gemm.SgemmWithBn(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), - N, relu, new_scale->data() + group, - new_bias->data() + group, bias); -#endif -} -void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - framework::Tensor *matrix_out, float *p, std::string mode, - float *bias, float *bias1) { - Gemm gemm; - auto dim_a = matrix_a.dims(); - auto dim_b = matrix_b.dims(); - auto dim_out = matrix_out->dims(); - PADDLE_MOBILE_ENFORCE( - dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of MatMul be matrix"); - - int M = dim_out[0]; - int N = dim_out[1]; - int K = (!trans_a) ? dim_a[1] : dim_a[0]; - -#ifdef _OPENMP - gemm.SgemmWithPRelu_omp(M, N, K, matrix_a.data(), K, - matrix_b.data(), N, matrix_out->data(), - N, p, mode, bias, bias1); -#else - gemm.SgemmWithPRelu(M, N, K, matrix_a.data(), K, - matrix_b.data(), N, matrix_out->data(), N, - p, mode, bias, bias1); -#endif -} - -template -struct ClearTensor { - void operator()(framework::Tensor *tensor) { - auto size = tensor->numel(); - auto *tensor_data = tensor->data(); - memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT - } -}; - -template -struct RowwiseAdd { - void operator()(const framework::Tensor &input, - const framework::Tensor &vector, framework::Tensor *output) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_MOBILE_ENFORCE((vector.numel() == size), - "vector.numel() must be equal to size."); - PADDLE_MOBILE_ENFORCE((output->dims() == in_dims), - "output->dims() must be equal to in_dims."); - - auto *input_data = input.data(); - auto *out_data = output->data(); - auto *vec_data = vector.data(); - for 
(int64_t i = 0; i < in_dims[0]; ++i) { - for (int64_t j = 0; j < size; ++j) { - out_data[i * size + j] = input_data[i * size + j] + vec_data[j]; - } - } - } -}; - -template struct RowwiseAdd; -template struct ClearTensor; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/math_function.h b/mobile/src/operators/math/math_function.h deleted file mode 100644 index ccc1a2b931..0000000000 --- a/mobile/src/operators/math/math_function.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -void SetConstant(framework::Tensor *tensor, float value); - -template -void MatMul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu = false, - Otype *bias = nullptr); - -template -void MatMul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu, Otype *bias, - bool addOnRow); - -void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu, - framework::Tensor *new_scale, framework::Tensor *new_bias, - int group, float *bias = nullptr); - -void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - framework::Tensor *matrix_out, float *p, std::string mode, - float *bias, float *bias1); - -template -struct ClearTensor { - void operator()(framework::Tensor *tensor); -}; - -template -struct RowwiseAdd { - void operator()(const framework::Tensor &input, const framework::Tensor &vec, - framework::Tensor *output); -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/math_function_int8.cpp b/mobile/src/operators/math/math_function_int8.cpp deleted file mode 100644 index 0595a808f0..0000000000 --- a/mobile/src/operators/math/math_function_int8.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "operators/math/gemm.h" -#include "operators/math/math_function.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template <> -void MatMul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - float alpha, framework::Tensor *matrix_out, - float beta, bool relu, int32_t *bias, - bool addOnRow) { - auto dim_a = matrix_a.dims(); - auto dim_b = matrix_b.dims(); - auto dim_out = matrix_out->dims(); - PADDLE_MOBILE_ENFORCE( - dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of MatMul be matrix"); - - int32_t M = dim_out[0]; - int32_t N = dim_out[1]; - int32_t K = (!trans_a) ? 
dim_a[1] : dim_a[0]; - Gemm gemm; - - if (trans_a) { - int32_t numel = matrix_a.numel(); - int32_t m = matrix_a.dims()[0]; - int32_t n = matrix_a.dims()[1]; - int8_t *tmp = (int8_t *)(matrix_a.data()); // NOLINT - int8_t *a = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * numel)); - int32_t index = 0; - for (int32_t j = 0; j < n; j++) { - for (int32_t i = 0; i < m; i++) { - a[index++] = tmp[i * n + j]; - } - } - -#ifdef _OPENMP - if (bias != nullptr) { - gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias, addOnRow); - } else { - gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias, addOnRow); - } -#else - if (bias != nullptr) { - gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias, addOnRow); - } else { - gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias, addOnRow); - } -#endif - } else { -#ifdef _OPENMP - if (bias != nullptr) { - gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias, addOnRow); - } else { - gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias, addOnRow); - } -#else - if (bias != nullptr) { - gemm.Sgemm(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), - N, relu, bias, addOnRow); - } else { - gemm.Sgemm(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), - N, relu, bias, addOnRow); - } -#endif - } -} - -template <> -void MatMul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - float alpha, framework::Tensor *matrix_out, - float beta, bool relu, int32_t *bias) { - MatMul(matrix_a, trans_a, matrix_b, trans_b, alpha, - matrix_out, beta, relu, bias, false); -} - -} // namespace math -} // namespace operators -} // namespace 
paddle_mobile diff --git a/mobile/src/operators/math/pad.cpp b/mobile/src/operators/math/pad.cpp deleted file mode 100644 index 49fede1eb3..0000000000 --- a/mobile/src/operators/math/pad.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "operators/math/pad.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -class PadFunctor { - public: - void operator()(const framework::Tensor &input, const int pad_top, - const int pad_bottom, const int pad_left, const int pad_right, - framework::Tensor *output) { - const T *in_data = input.data(); - T *out_data = output->mutable_data(); - // should check output shape is valid for such pad parameters - const framework::DDim &input_shape = input.dims(); - const framework::DDim &output_shape = output->dims(); - // fill output with 0 - memset(out_data, 0, sizeof(T) * output->numel()); - // should make sure the shape of output is match with input - for (int i = 0; i < input_shape[0]; ++i) { - for (int c = 0; c < input_shape[1]; ++c) { - out_data += pad_top * output_shape[3]; - for (int h = 0; h < input_shape[2]; ++h) { - memcpy(out_data + pad_left, in_data, sizeof(T) * input_shape[3]); - out_data += output_shape[3]; - in_data += input_shape[3]; - } - out_data += pad_bottom * output_shape[3]; - } - } - } -}; - -template class PadFunctor; -template class PadFunctor; - -} // namespace math -} // namespace operators 
-} // namespace paddle_mobile diff --git a/mobile/src/operators/math/pad.h b/mobile/src/operators/math/pad.h deleted file mode 100644 index 9031caf36a..0000000000 --- a/mobile/src/operators/math/pad.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -class PadFunctor { - public: - void operator()(const framework::Tensor &input, const int pad_top, - const int pad_bottom, const int pad_left, const int pad_right, - framework::Tensor *output); -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/poly_util.cpp b/mobile/src/operators/math/poly_util.cpp deleted file mode 100644 index 1cc1e2a403..0000000000 --- a/mobile/src/operators/math/poly_util.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MULTICLASSNMS_OP - -#include "operators/math/poly_util.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -void Array2PointVec(const T* box, const size_t box_size, - std::vector>* vec) { - size_t pts_num = box_size / 2; - vec->resize(pts_num); - for (size_t i = 0; i < pts_num; i++) { - vec->at(i).x = box[2 * i]; - vec->at(i).y = box[2 * i + 1]; - } -} - -template -void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) { - size_t pts_num = box_size / 2; - poly->num_contours = 1; - poly->hole = reinterpret_cast(malloc(sizeof(int))); - poly->hole[0] = 0; - poly->contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); - poly->contour->num_vertices = pts_num; - poly->contour->vertex = - (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); - for (size_t i = 0; i < pts_num; ++i) { - poly->contour->vertex[i].x = box[2 * i]; - poly->contour->vertex[i].y = box[2 * i + 1]; - } -} - -template void Array2Poly(const float* box, const size_t box_size, - gpc::gpc_polygon* poly); - -template -void Poly2PointVec(const gpc::gpc_vertex_list& contour, - std::vector>* vec) { - int pts_num = contour.num_vertices; - vec->resize(pts_num); - for (size_t i = 0; i < pts_num; i++) { - vec->at(i).x = contour.vertex[i].x; - vec->at(i).y = contour.vertex[i].y; - } -} - -template -T GetContourArea(const std::vector>& vec) { - int pts_num = vec.size(); - if (pts_num < 3) return T(0.); - T area = T(0.); - for (size_t i = 0; i < pts_num; ++i) { - area += vec[i].x * vec[(i + 1) % pts_num].y - - vec[i].y * vec[(i + 1) % pts_num].x; - } - return fabs(area / 2.0); -} - -template -T PolyArea(const T* box, const size_t box_size, const bool normalized) { - // If coordinate values are is invalid - // if area size <= 0, return 0. 
- std::vector> vec; - Array2PointVec(box, box_size, &vec); - return GetContourArea(vec); -} - -template float PolyArea(const float* box, const size_t box_size, - const bool normalized); - -template -T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, - const bool normalized) { - gpc::gpc_polygon poly1; - gpc::gpc_polygon poly2; - Array2Poly(box1, box_size, &poly1); - Array2Poly(box2, box_size, &poly2); - gpc::gpc_polygon respoly; - gpc::gpc_op op = gpc::GPC_INT; - gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly); - - T inter_area = T(0.); - int contour_num = respoly.num_contours; - for (int i = 0; i < contour_num; ++i) { - std::vector> resvec; - Poly2PointVec(respoly.contour[i], &resvec); - inter_area += GetContourArea(resvec); - } - - gpc::gpc_free_polygon(&poly1); - gpc::gpc_free_polygon(&poly2); - gpc::gpc_free_polygon(&respoly); - return inter_area; -} - -template float PolyOverlapArea(const float* box1, const float* box2, - const size_t box_size, const bool normalized); - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/poly_util.h b/mobile/src/operators/math/poly_util.h deleted file mode 100644 index 96951a0ab1..0000000000 --- a/mobile/src/operators/math/poly_util.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef MULTICLASSNMS_OP -#pragma once - -#include -#include "operators/math/gpc.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -class Point_ { - public: - // default constructor - Point_() {} - Point_(T _x, T _y) {} - Point_(const Point_& pt) {} - - Point_& operator=(const Point_& pt); - // conversion to another data type - // template operator Point_<_T>() const; - // conversion to the old-style C structures - // operator Vec() const; - - // checks whether the point is inside the specified rectangle - // bool inside(const Rect_& r) const; - T x; //!< x coordinate of the point - T y; //!< y coordinate of the point -}; - -template -void Array2PointVec(const T* box, const size_t box_size, - std::vector>* vec); - -template -void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly); - -template -void Poly2PointVec(const gpc::gpc_vertex_list& contour, - std::vector>* vec); - -template -T GetContourArea(const std::vector>& vec); - -template -T PolyArea(const T* box, const size_t box_size, const bool normalized); - -template -T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, - const bool normalized); - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/pooling.cpp b/mobile/src/operators/math/pooling.cpp deleted file mode 100644 index 46b4453e73..0000000000 --- a/mobile/src/operators/math/pooling.cpp +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#include "operators/math/pooling.h" -namespace paddle_mobile { -namespace operators { -namespace math { - -template -void Pooling

::operator()(const framework::Tensor &input, - const std::vector &kernel_size, - const std::vector &strides, - const std::vector &paddings, - framework::Tensor *output) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = kernel_size[0]; - const int ksize_width = kernel_size[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - - const float *input_data = input.data(); - float *output_data = output->mutable_data(); - const size_t input_spatial_size = input_height * input_width; - const size_t output_spatial_size = output_height * output_width; - - #pragma omp parallel for collapse(2) - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - int channel = i * output_channels + c; - const float *input_ptr = input_data + channel * input_spatial_size; - float *output_ptr = output_data + channel * output_spatial_size; - - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - PoolingVal

val; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - val += input_ptr[h * input_width + w]; - } - } - output_ptr[ph * output_width + pw] = val.Value(); - } - } - } - } -} - -template struct Pooling; -template struct Pooling; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // POOL_OP diff --git a/mobile/src/operators/math/pooling.h b/mobile/src/operators/math/pooling.h deleted file mode 100644 index 70280ad0a0..0000000000 --- a/mobile/src/operators/math/pooling.h +++ /dev/null @@ -1,199 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#pragma once - -#include -#include -#include -#include -#include "common/types.h" -#include "framework/tensor.h" -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -struct PoolingVal { - float val; - int count; - PoolingVal() : count(0) { val = -std::numeric_limits::max(); } - inline PoolingVal

&operator+=(const float &x) { - val = std::max(val, x); - ++count; - return *this; - } - inline float Value() { return (count > 0) ? val : 0.f; } - inline float ExclusiveSum(int total) { - return ((count > 0) ? val : 0.f) * total; - } -}; - -template <> -struct PoolingVal { - float val; - int count; - PoolingVal() : val(0.f), count(0) {} - inline PoolingVal &operator+=(const float &x) { - val += x; - ++count; - return *this; - } - inline float Value() { return (count > 0) ? val * (1.f / count) : 0.f; } - inline float ExclusiveSum(int total) { return (count > 0) ? val : 0.f; } -}; - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -template -inline float32x4_t vPoolInitq_f32() { - return vdupq_n_f32(-std::numeric_limits::max()); -} - -template <> -inline float32x4_t vPoolInitq_f32() { - return vdupq_n_f32(0.f); -} - -template -inline float32x2_t vPoolInit_f32() { - return vdup_n_f32(-std::numeric_limits::max()); -} - -template <> -inline float32x2_t vPoolInit_f32() { - return vdup_n_f32(0.f); -} - -template -inline float32x4_t vPoolPreq_f32(const float32x4_t &x1, const float32x4_t &x2) { - return vmaxq_f32(x1, x2); -} - -template <> -inline float32x4_t vPoolPreq_f32(const float32x4_t &x1, - const float32x4_t &x2) { - return vaddq_f32(x1, x2); -} - -template -inline float32x2_t vPoolPre_f32(const float32x2_t &x1, const float32x2_t &x2) { - return vmax_f32(x1, x2); -} - -template <> -inline float32x2_t vPoolPre_f32(const float32x2_t &x1, - const float32x2_t &x2) { - return vadd_f32(x1, x2); -} - -template -inline float32x2_t vpPoolPre_f32(const float32x2_t &x1, const float32x2_t &x2) { - return vpmax_f32(x1, x2); -} - -template <> -inline float32x2_t vpPoolPre_f32(const float32x2_t &x1, - const float32x2_t &x2) { - return vpadd_f32(x1, x2); -} - -template -inline float32x4_t vPoolPostq_f32(const float32x4_t &x, - const float32x4_t &post) { - return x; -} - -template <> -inline float32x4_t vPoolPostq_f32(const float32x4_t &x, - const float32x4_t &post) { - return 
vmulq_f32(x, post); -} - -template -inline float32x2_t vPoolPost_f32(const float32x2_t &x, - const float32x2_t &post) { - return x; -} - -template <> -inline float32x2_t vPoolPost_f32(const float32x2_t &x, - const float32x2_t &post) { - return vmul_f32(x, post); -} -#endif // __ARM_NEON__ - -template -inline float PoolPre(const float &x1, const float &x2) { - return std::max(x1, x2); -} - -template <> -inline float PoolPre(const float &x1, const float &x2) { - return x1 + x2; -} - -template -inline float PoolPost(const float &x, const float &post) { - return x; -} - -template <> -inline float PoolPost(const float &x, const float &post) { - return x * post; -} - -template -struct Pooling { - void operator()(const framework::Tensor &input, - const std::vector &kernel_size, - const std::vector &strides, - const std::vector &paddings, framework::Tensor *output); -}; - -template -struct Pooling2x2 { - void operator()(const framework::Tensor &input, - const std::vector &paddings, framework::Tensor *output); -}; - -template -struct Pooling3x3 { - void operator()(const framework::Tensor &input, - const std::vector &paddings, const bool exclusive, - framework::Tensor *output); -}; - -template -struct Pooling5x5 { - void operator()(const framework::Tensor &input, - const std::vector &paddings, framework::Tensor *output); -}; - -template -struct Pooling7x7 { - void operator()(const framework::Tensor &input, - const std::vector &paddings, framework::Tensor *output); -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/pooling2x2.cpp b/mobile/src/operators/math/pooling2x2.cpp deleted file mode 100644 index 1d8845ce69..0000000000 --- a/mobile/src/operators/math/pooling2x2.cpp +++ /dev/null @@ -1,791 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - -#include -#include "operators/math/pooling.h" - -// TODO(hjchen2): Optimize Pooling2x2NormalRow and use inline assembly - -namespace paddle_mobile { -namespace operators { -namespace math { - -#define POOLING2X2_NORMAL_BORDER(start, end) \ - for (int w = start; w < end; ++w) { \ - const int w_in_start = -padding_w + w * Stride; \ - const int w_in_end = w_in_start + 2; \ - const int w_start = w_in_start > 0 ? w_in_start : 0; \ - const int w_end = w_in_end < input_w ? w_in_end : input_w; \ - PoolingVal

val; \ - for (int h_in = h_start; h_in < h_end; ++h_in) { \ - for (int w_in = w_start; w_in < w_end; ++w_in) { \ - val += input[h_in * input_w + w_in]; \ - } \ - } \ - output_ptr[w] = val.Value(); \ - } - -template -struct Pooling2x2NormalRowLoadInput { - void operator()(const float *input, float32x4_t *x0, float32x4_t *x1) { - x0[0] = vld1q_f32(input); - x0[1] = vld1q_f32(input + 4); - x1[0] = vextq_f32(x0[0], x0[1], 1); - x1[1] = vextq_f32(x0[1], x0[1], 1); - } -}; - -template -struct Pooling2x2NormalRowLoadInput { - void operator()(const float *input, float32x4_t *x0, float32x4_t *x1) { - float32x4x2_t t0 = vld2q_f32(input); - float32x4x2_t t1 = vld2q_f32(input + 8); - x0[0] = t0.val[0]; - x0[1] = t1.val[0]; - x1[0] = t0.val[1]; - x1[1] = t1.val[1]; - } -}; - -template -inline void Pooling2x2NormalRow(const float *input, const int h_output, - const int input_h, const int input_w, - const int padding_h, const int padding_w, - const int output_w, float *output) { - const int h_in_start = -padding_h + h_output * Stride; - const int h_in_end = h_in_start + 2; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int h_end = h_in_end < input_h ? h_in_end : input_h; - - float *output_ptr = output + h_output * output_w; - if (h_end - h_start <= 0) { - memset(output_ptr, 0, output_w * sizeof(float)); - return; - } - - const int valid_w_start = (padding_w + Stride - 1) / Stride; - const int valid_w_end = (input_w + padding_w - 2) / Stride + 1; - const int valid_w = valid_w_end - valid_w_start; - - // border left - POOLING2X2_NORMAL_BORDER(0, valid_w_start) - // valid w - Pooling2x2NormalRowLoadInput load_input; - int output_tiles = valid_w / 6; - int output_tiles_w = output_tiles * 6; - float32x4_t x0[2], x1[2], y0[2]; - float32x4_t post = vdupq_n_f32(1.f / (2 * (h_end - h_start))); - for (int w = 0; w < output_tiles_w; w += 6) { - int output_offset = valid_w_start + w; - int input_w_offset = output_offset * Stride - padding_w; - y0[0] = vPoolInitq_f32

(); - y0[1] = vPoolInitq_f32

(); - for (int h_in = h_start; h_in < h_end; ++h_in) { - load_input(input + h_in * input_w + input_w_offset, x0, x1); - y0[0] = vPoolPreq_f32

(y0[0], x0[0]); - y0[0] = vPoolPreq_f32

(y0[0], x1[0]); - y0[1] = vPoolPreq_f32

(y0[1], x0[1]); - y0[1] = vPoolPreq_f32

(y0[1], x1[1]); - } - y0[0] = vPoolPostq_f32

(y0[0], post); - y0[1] = vPoolPostq_f32

(y0[1], post); - vst1q_f32(output_ptr + output_offset, y0[0]); - vst1_f32(output_ptr + output_offset + 4, vget_low_f32(y0[1])); - } - // remain valid w - int remain = valid_w - output_tiles_w; - if (remain > 0) { - int remain_start = valid_w_start + output_tiles_w; - int input_w_offset = remain_start * Stride - padding_w; - float *output_ptr0 = output_ptr + remain_start; - y0[0] = vPoolInitq_f32

(); - y0[1] = vPoolInitq_f32

(); - for (int h_in = h_start; h_in < h_end; ++h_in) { - load_input(input + h_in * input_w + input_w_offset, x0, x1); - y0[0] = vPoolPreq_f32

(y0[0], x0[0]); - y0[0] = vPoolPreq_f32

(y0[0], x1[0]); - y0[1] = vPoolPreq_f32

(y0[1], x0[1]); - y0[1] = vPoolPreq_f32

(y0[1], x1[1]); - } - y0[0] = vPoolPostq_f32

(y0[0], post); - y0[1] = vPoolPostq_f32

(y0[1], post); - switch (remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0[0])); - vst1q_lane_f32(output_ptr0 + 2, y0[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0[0]); - vst1q_lane_f32(output_ptr0 + 4, y0[1], 0); - break; - } - } - // border right - POOLING2X2_NORMAL_BORDER(valid_w_end, output_w) -} - -template -struct Pooling2x2 { - inline void operator()(const framework::Tensor &input, - const std::vector &paddings, - framework::Tensor *output) { - const float *input_data = input.data(); - float *output_data = output->mutable_data(); - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int padding_h = paddings[0]; - int padding_w = paddings[1]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - int valid_h_start = padding_h; - int valid_h_end = output_h - valid_h_start; - int valid_h = valid_h_end - valid_h_start; - int valid_w_start = padding_w; - int valid_w_end = output_w - valid_w_start; - int valid_w = valid_w_end - valid_w_start; - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < output->dims()[0]; ++batch) { - for (int c = 0; c < output->dims()[1]; ++c) { - int channel = batch * output->dims()[1] + c; - const float *input_ptr = input_data + channel * image_size; - float *output_ptr = output_data + channel * out_image_size; - // top - for (int h = 0; h < valid_h_start; ++h) { - Pooling2x2NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, output_ptr); - } - // valid - int output_w_tiles = valid_w / 6; - int output_w_remain = valid_w - output_w_tiles * 6; - for (int h = valid_h_start; h < valid_h_end - 3; h += 4) { - const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const float 
*input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - const float *input_ptr4 = input_ptr3 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - float *output_ptr1 = output_ptr0 + output_w; - float *output_ptr2 = output_ptr1 + output_w; - float *output_ptr3 = output_ptr2 + output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 2) { - output_ptr0[w] = 0.f; - output_ptr1[w] = 0.f; - output_ptr2[w] = 0.f; - output_ptr3[w] = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - float acc1 = PoolPre

(*input_ptr1, *input_ptr2); - float acc2 = PoolPre

(*input_ptr2, *input_ptr3); - float acc3 = PoolPre

(*input_ptr3, *input_ptr4); - output_ptr0[w] = PoolPost

(acc0, 0.5f); - output_ptr1[w] = PoolPost

(acc1, 0.5f); - output_ptr2[w] = PoolPost

(acc2, 0.5f); - output_ptr3[w] = PoolPost

(acc3, 0.5f); - } - } - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - output_ptr2 += valid_w_start; - output_ptr3 += valid_w_start; - } - // valid - float32x4x2_t x0, x1, q0; - float32x4x2_t y0, y1; - float32x4_t post = vdupq_n_f32(0.25f); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vld1q_f32(input_ptr1); - x1.val[1] = vld1q_f32(input_ptr1 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y0.val[0] = vPoolPreq_f32

(x0.val[0], q0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], q0.val[1]); - - q0.val[0] = vextq_f32(x1.val[0], x1.val[1], 1); - q0.val[1] = vextq_f32(x1.val[1], x1.val[1], 1); - y1.val[0] = vPoolPreq_f32

(x1.val[0], q0.val[0]); - y1.val[1] = vPoolPreq_f32

(x1.val[1], q0.val[1]); - y0.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); - y0.val[1] = vPoolPreq_f32

(y0.val[1], y1.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr0, y0.val[0]); - vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); - - x0.val[0] = vld1q_f32(input_ptr2); - x0.val[1] = vld1q_f32(input_ptr2 + 4); - x1.val[0] = vld1q_f32(input_ptr3); - x1.val[1] = vld1q_f32(input_ptr3 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y0.val[0] = vPoolPreq_f32

(x0.val[0], q0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], q0.val[1]); - y1.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); - y1.val[1] = vPoolPreq_f32

(y1.val[1], y0.val[1]); - y1.val[0] = vPoolPostq_f32

(y1.val[0], post); - y1.val[1] = vPoolPostq_f32

(y1.val[1], post); - vst1q_f32(output_ptr1, y1.val[0]); - vst1_f32(output_ptr1 + 4, vget_low_f32(y1.val[1])); - - q0.val[0] = vextq_f32(x1.val[0], x1.val[1], 1); - q0.val[1] = vextq_f32(x1.val[1], x1.val[1], 1); - y1.val[0] = vPoolPreq_f32

(x1.val[0], q0.val[0]); - y1.val[1] = vPoolPreq_f32

(x1.val[1], q0.val[1]); - y0.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); - y0.val[1] = vPoolPreq_f32

(y0.val[1], y1.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr2, y0.val[0]); - vst1_f32(output_ptr2 + 4, vget_low_f32(y0.val[1])); - - x0.val[0] = vld1q_f32(input_ptr4); - x0.val[1] = vld1q_f32(input_ptr4 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y1.val[0] = vPoolPreq_f32

(y1.val[0], x0.val[0]); - y1.val[0] = vPoolPreq_f32

(y1.val[0], q0.val[0]); - y1.val[1] = vPoolPreq_f32

(y1.val[1], x0.val[1]); - y1.val[1] = vPoolPreq_f32

(y1.val[1], q0.val[1]); - y1.val[0] = vPoolPostq_f32

(y1.val[0], post); - y1.val[1] = vPoolPostq_f32

(y1.val[1], post); - vst1q_f32(output_ptr3, y1.val[0]); - vst1_f32(output_ptr3 + 4, vget_low_f32(y1.val[1])); - - input_ptr0 += 6; - input_ptr1 += 6; - input_ptr2 += 6; - input_ptr3 += 6; - input_ptr4 += 6; - output_ptr0 += 6; - output_ptr1 += 6; - output_ptr2 += 6; - output_ptr3 += 6; - } - // remain width - if (output_w_remain > 0) { - float32x4x2_t y2, y3; - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vld1q_f32(input_ptr1); - x1.val[1] = vld1q_f32(input_ptr1 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y0.val[0] = vPoolPreq_f32

(x0.val[0], q0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], q0.val[1]); - - q0.val[0] = vextq_f32(x1.val[0], x1.val[1], 1); - q0.val[1] = vextq_f32(x1.val[1], x1.val[1], 1); - y1.val[0] = vPoolPreq_f32

(x1.val[0], q0.val[0]); - y1.val[1] = vPoolPreq_f32

(x1.val[1], q0.val[1]); - y0.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); - y0.val[1] = vPoolPreq_f32

(y0.val[1], y1.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - - x0.val[0] = vld1q_f32(input_ptr2); - x0.val[1] = vld1q_f32(input_ptr2 + 4); - x1.val[0] = vld1q_f32(input_ptr3); - x1.val[1] = vld1q_f32(input_ptr3 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y2.val[0] = vPoolPreq_f32

(x0.val[0], q0.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], q0.val[1]); - y1.val[0] = vPoolPreq_f32

(y1.val[0], y2.val[0]); - y1.val[1] = vPoolPreq_f32

(y1.val[1], y2.val[1]); - y1.val[0] = vPoolPostq_f32

(y1.val[0], post); - y1.val[1] = vPoolPostq_f32

(y1.val[1], post); - - q0.val[0] = vextq_f32(x1.val[0], x1.val[1], 1); - q0.val[1] = vextq_f32(x1.val[1], x1.val[1], 1); - y3.val[0] = vPoolPreq_f32

(x1.val[0], q0.val[0]); - y3.val[1] = vPoolPreq_f32

(x1.val[1], q0.val[1]); - y2.val[0] = vPoolPreq_f32

(y2.val[0], y3.val[0]); - y2.val[1] = vPoolPreq_f32

(y2.val[1], y3.val[1]); - y2.val[0] = vPoolPostq_f32

(y2.val[0], post); - y2.val[1] = vPoolPostq_f32

(y2.val[1], post); - - x0.val[0] = vld1q_f32(input_ptr4); - x0.val[1] = vld1q_f32(input_ptr4 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y3.val[0] = vPoolPreq_f32

(y3.val[0], x0.val[0]); - y3.val[0] = vPoolPreq_f32

(y3.val[0], q0.val[0]); - y3.val[1] = vPoolPreq_f32

(y3.val[1], x0.val[1]); - y3.val[1] = vPoolPreq_f32

(y3.val[1], q0.val[1]); - y3.val[0] = vPoolPostq_f32

(y3.val[0], post); - y3.val[1] = vPoolPostq_f32

(y3.val[1], post); - - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0.val[0], 0); - vst1q_lane_f32(output_ptr1, y1.val[0], 0); - vst1q_lane_f32(output_ptr2, y2.val[0], 0); - vst1q_lane_f32(output_ptr3, y3.val[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1_f32(output_ptr1, vget_low_f32(y1.val[0])); - vst1_f32(output_ptr2, vget_low_f32(y2.val[0])); - vst1_f32(output_ptr3, vget_low_f32(y3.val[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1_f32(output_ptr1, vget_low_f32(y1.val[0])); - vst1_f32(output_ptr2, vget_low_f32(y2.val[0])); - vst1_f32(output_ptr3, vget_low_f32(y3.val[0])); - vst1q_lane_f32(output_ptr0 + 2, y0.val[0], 2); - vst1q_lane_f32(output_ptr1 + 2, y1.val[0], 2); - vst1q_lane_f32(output_ptr2 + 2, y2.val[0], 2); - vst1q_lane_f32(output_ptr3 + 2, y3.val[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_f32(output_ptr1, y1.val[0]); - vst1q_f32(output_ptr2, y2.val[0]); - vst1q_f32(output_ptr3, y3.val[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_f32(output_ptr1, y1.val[0]); - vst1q_f32(output_ptr2, y2.val[0]); - vst1q_f32(output_ptr3, y3.val[0]); - vst1q_lane_f32(output_ptr0 + 4, y0.val[1], 0); - vst1q_lane_f32(output_ptr1 + 4, y1.val[1], 0); - vst1q_lane_f32(output_ptr2 + 4, y2.val[1], 0); - vst1q_lane_f32(output_ptr3 + 4, y3.val[1], 0); - break; - } - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - input_ptr2 += output_w_remain; - input_ptr3 += output_w_remain; - input_ptr4 += output_w_remain; - output_ptr0 += output_w_remain; - output_ptr1 += output_w_remain; - output_ptr2 += output_w_remain; - output_ptr3 += output_w_remain; - } - // pad right - if (padding_w) { - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 2 - (padding_w + input_w); - if (padding >= 2) { - *output_ptr0 = 0.f; - *output_ptr1 = 0.f; - *output_ptr2 = 0.f; - *output_ptr3 = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - float acc1 = PoolPre

(*input_ptr1, *input_ptr2); - float acc2 = PoolPre

(*input_ptr2, *input_ptr3); - float acc3 = PoolPre

(*input_ptr3, *input_ptr4); - *output_ptr0 = PoolPost

(acc0, 0.5f); - *output_ptr1 = PoolPost

(acc1, 0.5f); - *output_ptr2 = PoolPost

(acc2, 0.5f); - *output_ptr3 = PoolPost

(acc3, 0.5f); - } - output_ptr0++; - output_ptr1++; - output_ptr2++; - output_ptr3++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xFFFFFFFC); - for (int h = start_h; h < valid_h_end; ++h) { - const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 2) { - output_ptr0[w] = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - output_ptr0[w] = PoolPost

(acc0, 0.5f); - } - } - output_ptr0 += valid_w_start; - } - // valid - float32x4x2_t x0, x1, q0, y0; - float32x4_t post = vdupq_n_f32(0.25f); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vld1q_f32(input_ptr1); - x1.val[1] = vld1q_f32(input_ptr1 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y0.val[0] = vPoolPreq_f32

(x0.val[0], q0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], q0.val[1]); - - q0.val[0] = vextq_f32(x1.val[0], x1.val[1], 1); - q0.val[1] = vextq_f32(x1.val[1], x1.val[1], 1); - y0.val[0] = vPoolPreq_f32

(y0.val[0], x1.val[0]); - y0.val[1] = vPoolPreq_f32

(y0.val[1], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(y0.val[0], q0.val[0]); - y0.val[1] = vPoolPreq_f32

(y0.val[1], q0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr0, y0.val[0]); - vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); - - input_ptr0 += 6; - input_ptr1 += 6; - output_ptr0 += 6; - } - // remain width - if (output_w_remain > 0) { - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vld1q_f32(input_ptr1); - x1.val[1] = vld1q_f32(input_ptr1 + 4); - q0.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - q0.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - y0.val[0] = vPoolPreq_f32

(x0.val[0], q0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], q0.val[1]); - - q0.val[0] = vextq_f32(x1.val[0], x1.val[1], 1); - q0.val[1] = vextq_f32(x1.val[1], x1.val[1], 1); - y0.val[0] = vPoolPreq_f32

(y0.val[0], x1.val[0]); - y0.val[1] = vPoolPreq_f32

(y0.val[1], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(y0.val[0], q0.val[0]); - y0.val[1] = vPoolPreq_f32

(y0.val[1], q0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0.val[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1q_lane_f32(output_ptr0 + 2, y0.val[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0.val[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_lane_f32(output_ptr0 + 4, y0.val[1], 0); - break; - } - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - output_ptr0 += output_w_remain; - } - // pad right - if (padding_w) { - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 2 - (padding_w + input_w); - if (padding >= 2) { - *output_ptr0 = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - *output_ptr0 = PoolPost

(acc0, 0.5f); - } - output_ptr0++; - } - } - } - // bottom - for (int h = valid_h_end; h < output_h; ++h) { - Pooling2x2NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, output_ptr); - } - } - } - } -}; - -template -struct Pooling2x2 { - inline void operator()(const framework::Tensor &input, - const std::vector &paddings, - framework::Tensor *output) { - const float *input_data = input.data(); - float *output_data = output->mutable_data(); - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int padding_h = paddings[0]; - int padding_w = paddings[1]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - int valid_h_start = (padding_h + 1) / 2; - int valid_h_end = (input_h + padding_h) / 2; - int valid_h = valid_h_end - valid_h_start; - int valid_w_start = (padding_w + 1) / 2; - int valid_w_end = (input_w + padding_w) / 2; - int valid_w = valid_w_end - valid_w_start; - - bool ceil_mode = (((input_h + 2 * padding_h) / 2) < output_h) || - (((input_w + 2 * padding_w) / 2) < output_w); - int padding_b = - padding_h + (ceil_mode ? 2 * output_h - (input_h + 2 * padding_h) : 0); - int padding_r = - padding_w + (ceil_mode ? 
2 * output_w - (input_w + 2 * padding_w) : 0); - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < output->dims()[0]; ++batch) { - for (int c = 0; c < output->dims()[1]; ++c) { - int channel = batch * output->dims()[1] + c; - const float *input_ptr = input_data + channel * image_size; - float *output_ptr = output_data + channel * out_image_size; - // top - for (int h = 0; h < valid_h_start; ++h) { - Pooling2x2NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, output_ptr); - } - // valid - int output_w_tiles = valid_w / 4; - int output_w_remain = valid_w - output_w_tiles * 4; - for (int h = valid_h_start; h < valid_h_end - 1; h += 2) { - const float *input_ptr0 = input_ptr + (2 * h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - float *output_ptr1 = output_ptr0 + output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w * 2; - if (padding >= 2) { - output_ptr0[w] = 0.f; - output_ptr1[w] = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - float acc1 = PoolPre

(*input_ptr2, *input_ptr3); - output_ptr0[w] = PoolPost

(acc0, 0.5f); - output_ptr1[w] = PoolPost

(acc1, 0.5f); - } - } - input_ptr0 += (padding_w & 0x1); - input_ptr1 += (padding_w & 0x1); - input_ptr2 += (padding_w & 0x1); - input_ptr3 += (padding_w & 0x1); - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - } - // valid - float32x4x2_t x0, x1, x2, x3; - float32x4_t y0, y1; - float32x4_t post = vdupq_n_f32(0.25f); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr1); - x2 = vld2q_f32(input_ptr2); - x3 = vld2q_f32(input_ptr3); - y0 = vPoolPreq_f32

(x0.val[0], x0.val[1]); - y1 = vPoolPreq_f32

(x2.val[0], x2.val[1]); - y0 = vPoolPreq_f32

(y0, x1.val[0]); - y1 = vPoolPreq_f32

(y1, x3.val[0]); - y0 = vPoolPreq_f32

(y0, x1.val[1]); - y1 = vPoolPreq_f32

(y1, x3.val[1]); - y0 = vPoolPostq_f32

(y0, post); - y1 = vPoolPostq_f32

(y1, post); - vst1q_f32(output_ptr0, y0); - vst1q_f32(output_ptr1, y1); - - input_ptr0 += 8; - input_ptr1 += 8; - input_ptr2 += 8; - input_ptr3 += 8; - output_ptr0 += 4; - output_ptr1 += 4; - } - // remain width - if (output_w_remain > 0) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr1); - x2 = vld2q_f32(input_ptr2); - x3 = vld2q_f32(input_ptr3); - y0 = vPoolPreq_f32

(x0.val[0], x0.val[1]); - y1 = vPoolPreq_f32

(x2.val[0], x2.val[1]); - y0 = vPoolPreq_f32

(y0, x1.val[0]); - y1 = vPoolPreq_f32

(y1, x3.val[0]); - y0 = vPoolPreq_f32

(y0, x1.val[1]); - y1 = vPoolPreq_f32

(y1, x3.val[1]); - y0 = vPoolPostq_f32

(y0, post); - y1 = vPoolPostq_f32

(y1, post); - - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0, 0); - vst1q_lane_f32(output_ptr1, y1, 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0)); - vst1_f32(output_ptr1, vget_low_f32(y1)); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0)); - vst1q_lane_f32(output_ptr0 + 2, y0, 2); - vst1_f32(output_ptr1, vget_low_f32(y1)); - vst1q_lane_f32(output_ptr1 + 2, y1, 2); - break; - } - input_ptr0 += 2 * output_w_remain; - input_ptr1 += 2 * output_w_remain; - input_ptr2 += 2 * output_w_remain; - input_ptr3 += 2 * output_w_remain; - output_ptr0 += output_w_remain; - output_ptr1 += output_w_remain; - } - // pad right - if (padding_r) { - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 2 - (padding_w + input_w); - if (padding >= 2) { - *output_ptr0 = 0.f; - *output_ptr1 = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - float acc1 = PoolPre

(*input_ptr2, *input_ptr3); - *output_ptr0 = PoolPost

(acc0, 0.5f); - *output_ptr1 = PoolPost

(acc1, 0.5f); - } - output_ptr0++; - output_ptr1++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xfffffffe); - for (int h = start_h; h < valid_h_end; ++h) { - const float *input_ptr0 = input_ptr + (2 * h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - // pad left - if (padding_w) { - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - 2 * w; - if (padding >= 2) { - output_ptr0[w] = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - output_ptr0[w] = PoolPost

(acc0, 0.5f); - } - } - input_ptr0 += (padding_w & 0x1); - input_ptr1 += (padding_w & 0x1); - output_ptr0 += valid_w_start; - } - // valid - float32x4x2_t x0, x1; - float32x4_t y0; - float32x4_t post = vdupq_n_f32(0.25f); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr1); - y0 = vPoolPreq_f32

(x0.val[0], x0.val[1]); - y0 = vPoolPreq_f32

(y0, x1.val[0]); - y0 = vPoolPreq_f32

(y0, x1.val[1]); - y0 = vPoolPostq_f32

(y0, post); - vst1q_f32(output_ptr0, y0); - - input_ptr0 += 8; - input_ptr1 += 8; - output_ptr0 += 4; - } - // remain width - if (output_w_remain > 0) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr1); - y0 = vPoolPreq_f32

(x0.val[0], x0.val[1]); - y0 = vPoolPreq_f32

(y0, x1.val[0]); - y0 = vPoolPreq_f32

(y0, x1.val[1]); - y0 = vPoolPostq_f32

(y0, post); - - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0, 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0)); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0)); - vst1q_lane_f32(output_ptr0 + 2, y0, 2); - break; - } - input_ptr0 += 2 * output_w_remain; - input_ptr1 += 2 * output_w_remain; - output_ptr0 += output_w_remain; - } - // pad right - if (padding_r) { - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 2 - (padding_w + input_w); - if (padding >= 2) { - *output_ptr0 = 0.f; - } else { - float acc0 = PoolPre

(*input_ptr0, *input_ptr1); - *output_ptr0 = PoolPost

(acc0, 0.5f); - } - output_ptr0++; - } - } - } - // bottom - for (int h = valid_h_end; h < output_h; ++h) { - Pooling2x2NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, output_ptr); - } - } - } - } -}; - -template struct Pooling2x2; -template struct Pooling2x2; -template struct Pooling2x2; -template struct Pooling2x2; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // __ARM_NEON__ -#endif // POOL_OP diff --git a/mobile/src/operators/math/pooling3x3.cpp b/mobile/src/operators/math/pooling3x3.cpp deleted file mode 100644 index 3303dabb8d..0000000000 --- a/mobile/src/operators/math/pooling3x3.cpp +++ /dev/null @@ -1,1317 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - -#include -#include "operators/math/pooling.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -#define POOLING3X3_NORMAL_BORDER(start, end, exclusive) \ - for (int w = start; w < end; ++w) { \ - const int w_in_start = -padding_w + w * Stride; \ - const int w_in_end = w_in_start + 3; \ - const int w_start = w_in_start > 0 ? w_in_start : 0; \ - const int w_end = w_in_end < input_w ? w_in_end : input_w; \ - PoolingVal

val; \ - for (int h_in = h_start; h_in < h_end; ++h_in) { \ - for (int w_in = w_start; w_in < w_end; ++w_in) { \ - val += input[h_in * input_w + w_in]; \ - } \ - } \ - output_ptr[w] = exclusive ? val.Value() : val.ExclusiveSum(9) / 9.f; \ - } - -template -struct Pooling3x3NormalRowLoadInput { - inline void operator()(const float *input, float32x4x2_t &x0, // NOLINT - float32x4x2_t &x1, float32x4x2_t &x2, // NOLINT - float32x4x2_t &y0) { // NOLINT - x0.val[0] = vld1q_f32(input); - x0.val[1] = vld1q_f32(input + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y0.val[0] = vPoolPreq_f32

(x1.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x1.val[1], y0.val[1]); - y0.val[0] = vPoolPreq_f32

(x2.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x2.val[1], y0.val[1]); - } -}; - -template -struct Pooling3x3NormalRowLoadInput { - inline void operator()(const float *input, float32x4x2_t &x0, // NOLINT - float32x4x2_t &x1, float32x4x2_t &x2, // NOLINT - float32x4x2_t &y0) { // NOLINT - x0 = vld2q_f32(input); - x1 = vld2q_f32(input + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - } -}; - -template -inline void Pooling3x3NormalRow(const float *input, const int h_output, - const int input_h, const int input_w, - const int padding_h, const int padding_w, - const int output_w, const bool exclusive, - float *output) { - const int h_in_start = -padding_h + h_output * Stride; - const int h_in_end = h_in_start + 3; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int h_end = h_in_end < input_h ? h_in_end : input_h; - - float *output_ptr = output + h_output * output_w; - if (h_end - h_start <= 0) { - memset(output_ptr, 0, output_w * sizeof(float)); - return; - } - - const int valid_w_start = (padding_w + Stride - 1) / Stride; - const int valid_w_end = (input_w + padding_w - 3) / Stride + 1; - const int valid_w = valid_w_end - valid_w_start; - - // border left - POOLING3X3_NORMAL_BORDER(0, valid_w_start, exclusive) - // middle - int output_tiles = (valid_w_end - valid_w_start) / 6; - int output_tiles_w = output_tiles * 6; - Pooling3x3NormalRowLoadInput PoolingCompute; - float32x4x2_t x0, x1, x2, y0; - float32x4_t post = exclusive ? vdupq_n_f32(1.f / (3 * (h_end - h_start))) - : vdupq_n_f32(1.f / 9); - for (int w = 0; w < output_tiles_w; w += 6) { - int output_offset = valid_w_start + w; - int input_w_offset = output_offset * Stride - padding_w; - y0.val[0] = vPoolInitq_f32

(); - y0.val[1] = vPoolInitq_f32

(); - for (int h_in = h_start; h_in < h_end; ++h_in) { - PoolingCompute(input + h_in * input_w + input_w_offset, x0, x1, x2, y0); - } - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr + output_offset, y0.val[0]); - vst1_f32(output_ptr + output_offset + 4, vget_low_f32(y0.val[1])); - } - int remain = valid_w - output_tiles_w; - if (remain > 0) { - int remain_start = valid_w_start + output_tiles_w; - int input_w_offset = remain_start * Stride - padding_w; - float *output_ptr0 = output_ptr + remain_start; - y0.val[0] = vPoolInitq_f32

(); - y0.val[1] = vPoolInitq_f32

(); - for (int h_in = h_start; h_in < h_end; ++h_in) { - PoolingCompute(input + h_in * input_w + input_w_offset, x0, x1, x2, y0); - } - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - switch (remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0.val[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1q_lane_f32(output_ptr0 + 2, y0.val[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0.val[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_lane_f32(output_ptr0 + 4, y0.val[1], 0); - break; - } - } - // border right - POOLING3X3_NORMAL_BORDER(valid_w_end, output_w, exclusive) -} - -template -struct Pooling3x3 { - inline void operator()(const framework::Tensor &input, - const std::vector &paddings, const bool exclusive, - framework::Tensor *output) { - const float *input_data = input.data(); - float *output_data = output->mutable_data(); - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int padding_h = paddings[0]; - int padding_w = paddings[1]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - int valid_h_start = padding_h; - int valid_h = input_h - 2; - int valid_h_end = valid_h_start + valid_h; - int valid_w_start = padding_w; - int valid_w = input_w - 2; - int valid_w_end = valid_w_start + valid_w; - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < output->dims()[0]; ++batch) { - for (int c = 0; c < output->dims()[1]; ++c) { - int channel = batch * output->dims()[1] + c; - const float *input_ptr = input_data + channel * image_size; - float *output_ptr = output_data + channel * out_image_size; - // top - for (int h = 0; h < valid_h_start; ++h) { - Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, exclusive, output_ptr); - } - // valid - int output_w_tiles = valid_w / 6; - int output_w_remain = valid_w - output_w_tiles * 6; - for (int h = valid_h_start; h < valid_h_end - 3; h += 4) { - const float *input_ptr0 = input_ptr + (h - 
padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - const float *input_ptr4 = input_ptr3 + input_w; - const float *input_ptr5 = input_ptr4 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - float *output_ptr1 = output_ptr0 + output_w; - float *output_ptr2 = output_ptr1 + output_w; - float *output_ptr3 = output_ptr2 + output_w; - // pad left - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t row3 = vld1_f32(input_ptr3); - float32x2_t row4 = vld1_f32(input_ptr4); - float32x2_t row5 = vld1_f32(input_ptr5); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, acc1, acc2, acc3, acc12, acc34, post; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 3) { - output_ptr0[w] = 0.f; - output_ptr1[w] = 0.f; - output_ptr2[w] = 0.f; - output_ptr3[w] = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc12 = vPoolPre_f32

(row1, row2); - acc34 = vPoolPre_f32

(row3, row4); - acc0 = vPoolPre_f32

(row0, acc12); - acc1 = vPoolPre_f32

(row3, acc12); - acc2 = vPoolPre_f32

(row2, acc34); - acc3 = vPoolPre_f32

(row5, acc34); - acc0 = vpPoolPre_f32

(acc0, acc0); - acc1 = vpPoolPre_f32

(acc1, acc1); - acc2 = vpPoolPre_f32

(acc2, acc2); - acc3 = vpPoolPre_f32

(acc3, acc3); - acc0 = vPoolPost_f32

(acc0, post); - acc1 = vPoolPost_f32

(acc1, post); - acc2 = vPoolPost_f32

(acc2, post); - acc3 = vPoolPost_f32

(acc3, post); - vst1_lane_f32(output_ptr0 + w, acc0, 0); - vst1_lane_f32(output_ptr1 + w, acc1, 0); - vst1_lane_f32(output_ptr2 + w, acc2, 0); - vst1_lane_f32(output_ptr3 + w, acc3, 0); - row0 = vext_f32(pad0, row0, 1); - row1 = vext_f32(pad0, row1, 1); - row2 = vext_f32(pad0, row2, 1); - row3 = vext_f32(pad0, row3, 1); - row4 = vext_f32(pad0, row4, 1); - row5 = vext_f32(pad0, row5, 1); - } - } - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - output_ptr2 += valid_w_start; - output_ptr3 += valid_w_start; - } - // valid - float32x4x2_t x0, x1, x2; - float32x4x2_t y0, y1, y2; - float32x4_t post = vdupq_n_f32(1.f / 9); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0.val[0] = vld1q_f32(input_ptr1); - x0.val[1] = vld1q_f32(input_ptr1 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y1.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(y1.val[1], y0.val[1]); - - x0.val[0] = vld1q_f32(input_ptr2); - x0.val[1] = vld1q_f32(input_ptr2 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y2.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(y2.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(y2.val[1], y1.val[1]); - y0.val[0] = vPoolPreq_f32

(y2.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(y2.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr0, y0.val[0]); - vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); - - x0.val[0] = vld1q_f32(input_ptr3); - x0.val[1] = vld1q_f32(input_ptr3 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(y0.val[1], y1.val[1]); - y2.val[0] = vPoolPreq_f32

(y0.val[0], y2.val[0]); - y2.val[1] = vPoolPreq_f32

(y0.val[1], y2.val[1]); - y1.val[0] = vPoolPostq_f32

(y1.val[0], post); - y1.val[1] = vPoolPostq_f32

(y1.val[1], post); - vst1q_f32(output_ptr1, y1.val[0]); - vst1_f32(output_ptr1 + 4, vget_low_f32(y1.val[1])); - - x0.val[0] = vld1q_f32(input_ptr4); - x0.val[1] = vld1q_f32(input_ptr4 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y2.val[0] = vPoolPreq_f32

(x0.val[0], y2.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], y2.val[1]); - y2.val[0] = vPoolPostq_f32

(y2.val[0], post); - y2.val[1] = vPoolPostq_f32

(y2.val[1], post); - vst1q_f32(output_ptr2, y2.val[0]); - vst1_f32(output_ptr2 + 4, vget_low_f32(y2.val[1])); - - x0.val[0] = vld1q_f32(input_ptr5); - x0.val[1] = vld1q_f32(input_ptr5 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr3, y0.val[0]); - vst1_f32(output_ptr3 + 4, vget_low_f32(y0.val[1])); - - input_ptr0 += 6; - input_ptr1 += 6; - input_ptr2 += 6; - input_ptr3 += 6; - input_ptr4 += 6; - input_ptr5 += 6; - output_ptr0 += 6; - output_ptr1 += 6; - output_ptr2 += 6; - output_ptr3 += 6; - } - // remain width - if (output_w_remain > 0) { - float32x4x2_t y3; - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0.val[0] = vld1q_f32(input_ptr1); - x0.val[1] = vld1q_f32(input_ptr1 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y1.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(y1.val[1], y0.val[1]); - - x0.val[0] = vld1q_f32(input_ptr2); - x0.val[1] = vld1q_f32(input_ptr2 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y2.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(y2.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(y2.val[1], y1.val[1]); - y0.val[0] = vPoolPreq_f32

(y2.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(y2.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - - x0.val[0] = vld1q_f32(input_ptr3); - x0.val[1] = vld1q_f32(input_ptr3 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y3.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y3.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(y3.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(y3.val[1], y1.val[1]); - y2.val[0] = vPoolPreq_f32

(y3.val[0], y2.val[0]); - y2.val[1] = vPoolPreq_f32

(y3.val[1], y2.val[1]); - y1.val[0] = vPoolPostq_f32

(y1.val[0], post); - y1.val[1] = vPoolPostq_f32

(y1.val[1], post); - - x0.val[0] = vld1q_f32(input_ptr4); - x0.val[1] = vld1q_f32(input_ptr4 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y3.val[0] = vPoolPreq_f32

(x0.val[0], y3.val[0]); - y3.val[1] = vPoolPreq_f32

(x0.val[1], y3.val[1]); - y2.val[0] = vPoolPreq_f32

(x0.val[0], y2.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], y2.val[1]); - y2.val[0] = vPoolPostq_f32

(y2.val[0], post); - y2.val[1] = vPoolPostq_f32

(y2.val[1], post); - - x0.val[0] = vld1q_f32(input_ptr5); - x0.val[1] = vld1q_f32(input_ptr5 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y3.val[0] = vPoolPreq_f32

(x0.val[0], y3.val[0]); - y3.val[1] = vPoolPreq_f32

(x0.val[1], y3.val[1]); - y3.val[0] = vPoolPostq_f32

(y3.val[0], post); - y3.val[1] = vPoolPostq_f32

(y3.val[1], post); - - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0.val[0], 0); - vst1q_lane_f32(output_ptr1, y1.val[0], 0); - vst1q_lane_f32(output_ptr2, y2.val[0], 0); - vst1q_lane_f32(output_ptr3, y3.val[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1_f32(output_ptr1, vget_low_f32(y1.val[0])); - vst1_f32(output_ptr2, vget_low_f32(y2.val[0])); - vst1_f32(output_ptr3, vget_low_f32(y3.val[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1_f32(output_ptr1, vget_low_f32(y1.val[0])); - vst1_f32(output_ptr2, vget_low_f32(y2.val[0])); - vst1_f32(output_ptr3, vget_low_f32(y3.val[0])); - vst1q_lane_f32(output_ptr0 + 2, y0.val[0], 2); - vst1q_lane_f32(output_ptr1 + 2, y1.val[0], 2); - vst1q_lane_f32(output_ptr2 + 2, y2.val[0], 2); - vst1q_lane_f32(output_ptr3 + 2, y3.val[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_f32(output_ptr1, y1.val[0]); - vst1q_f32(output_ptr2, y2.val[0]); - vst1q_f32(output_ptr3, y3.val[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_f32(output_ptr1, y1.val[0]); - vst1q_f32(output_ptr2, y2.val[0]); - vst1q_f32(output_ptr3, y3.val[0]); - vst1q_lane_f32(output_ptr0 + 4, y0.val[1], 0); - vst1q_lane_f32(output_ptr1 + 4, y1.val[1], 0); - vst1q_lane_f32(output_ptr2 + 4, y2.val[1], 0); - vst1q_lane_f32(output_ptr3 + 4, y3.val[1], 0); - break; - } - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - input_ptr2 += output_w_remain; - input_ptr3 += output_w_remain; - input_ptr4 += output_w_remain; - input_ptr5 += output_w_remain; - output_ptr0 += output_w_remain; - output_ptr1 += output_w_remain; - output_ptr2 += output_w_remain; - output_ptr3 += output_w_remain; - } - // pad right - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t row3 = vld1_f32(input_ptr3); - float32x2_t row4 = 
vld1_f32(input_ptr4); - float32x2_t row5 = vld1_f32(input_ptr5); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, acc1, acc2, acc3, acc12, acc34, post; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0.f; - *output_ptr1 = 0.f; - *output_ptr2 = 0.f; - *output_ptr3 = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc12 = vPoolPre_f32

(row1, row2); - acc34 = vPoolPre_f32

(row3, row4); - acc0 = vPoolPre_f32

(row0, acc12); - acc1 = vPoolPre_f32

(row3, acc12); - acc2 = vPoolPre_f32

(row2, acc34); - acc3 = vPoolPre_f32

(row5, acc34); - acc0 = vpPoolPre_f32

(acc0, acc0); - acc1 = vpPoolPre_f32

(acc1, acc1); - acc2 = vpPoolPre_f32

(acc2, acc2); - acc3 = vpPoolPre_f32

(acc3, acc3); - acc0 = vPoolPost_f32

(acc0, post); - acc1 = vPoolPost_f32

(acc1, post); - acc2 = vPoolPost_f32

(acc2, post); - acc3 = vPoolPost_f32

(acc3, post); - vst1_lane_f32(output_ptr0, acc0, 0); - vst1_lane_f32(output_ptr1, acc1, 0); - vst1_lane_f32(output_ptr2, acc2, 0); - vst1_lane_f32(output_ptr3, acc3, 0); - row0 = vext_f32(row0, pad0, 1); - row1 = vext_f32(row1, pad0, 1); - row2 = vext_f32(row2, pad0, 1); - row3 = vext_f32(row3, pad0, 1); - row4 = vext_f32(row4, pad0, 1); - row5 = vext_f32(row5, pad0, 1); - } - output_ptr0++; - output_ptr1++; - output_ptr2++; - output_ptr3++; - } - } - } - // remain height - int start_h = valid_h_start + (valid_h & 0xFFFFFFFC); - for (int h = start_h; h < valid_h_end; ++h) { - const float *input_ptr0 = input_ptr + (h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - // pad left - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, post; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - w; - if (padding >= 3) { - output_ptr0[w] = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc0 = vPoolPre_f32

(row0, row1); - acc0 = vPoolPre_f32

(acc0, row2); - acc0 = vpPoolPre_f32

(acc0, acc0); - acc0 = vPoolPost_f32

(acc0, post); - vst1_lane_f32(output_ptr0 + w, acc0, 0); - row0 = vext_f32(pad0, row0, 1); - row1 = vext_f32(pad0, row1, 1); - row2 = vext_f32(pad0, row2, 1); - } - } - output_ptr0 += valid_w_start; - } - // valid - float32x4x2_t x0, x1, x2, y0; - float32x4_t post = vdupq_n_f32(1.f / 9); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0.val[0] = vld1q_f32(input_ptr1); - x0.val[1] = vld1q_f32(input_ptr1 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - - x0.val[0] = vld1q_f32(input_ptr2); - x0.val[1] = vld1q_f32(input_ptr2 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr0, y0.val[0]); - vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); - - input_ptr0 += 6; - input_ptr1 += 6; - input_ptr2 += 6; - output_ptr0 += 6; - } - // remain width - if (output_w_remain > 0) { - x0.val[0] = vld1q_f32(input_ptr0); - x0.val[1] = vld1q_f32(input_ptr0 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0.val[0] = vld1q_f32(input_ptr1); - x0.val[1] = vld1q_f32(input_ptr1 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - - x0.val[0] = vld1q_f32(input_ptr2); - x0.val[1] = vld1q_f32(input_ptr2 + 4); - x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1); - x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1); - x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2); - x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x1.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - // restore - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0.val[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1q_lane_f32(output_ptr0 + 2, y0.val[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0.val[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_lane_f32(output_ptr0 + 4, y0.val[1], 0); - break; - } - input_ptr0 += output_w_remain; - input_ptr1 += output_w_remain; - input_ptr2 += output_w_remain; - output_ptr0 += output_w_remain; - } - // pad right - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, post; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc0 = vPoolPre_f32

(row0, row1); - acc0 = vPoolPre_f32

(acc0, row2); - acc0 = vpPoolPre_f32

(acc0, acc0); - acc0 = vPoolPost_f32

(acc0, post); - vst1_lane_f32(output_ptr0, acc0, 0); - row0 = vext_f32(row0, pad0, 1); - row1 = vext_f32(row1, pad0, 1); - row2 = vext_f32(row2, pad0, 1); - } - output_ptr0++; - } - } - } - // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { - Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, exclusive, output_ptr); - } - } - } - } -}; - -template -struct Pooling3x3 { - inline void operator()(const framework::Tensor &input, - const std::vector &paddings, const bool exclusive, - framework::Tensor *output) { - const float *input_data = input.data(); - float *output_data = output->mutable_data(); - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int padding_h = paddings[0]; - int padding_w = paddings[1]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - int valid_h_start = (padding_h + 1) / 2; - int valid_h_end = (input_h + padding_h - 1) / 2; - int valid_h = valid_h_end - valid_h_start; - int valid_w_start = (padding_w + 1) / 2; - int valid_w_end = (input_w + padding_w - 1) / 2; - int valid_w = valid_w_end - valid_w_start; - - int padding_height = input_h + 2 * padding_h; - int padding_width = input_w + 2 * padding_w; - bool ceil_mode = (((padding_height - 1) / 2) < output_h) || - (((padding_width - 1) / 2) < output_w); - int padding_b = - padding_h + (ceil_mode ? 2 * output_h - (padding_height - 1) : 0); - int padding_r = - padding_w + (ceil_mode ? 
2 * output_w - (padding_width - 1) : 0); - // for pad left - int valid_input_w_start = (valid_w_start << 1) - padding_w; - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < output->dims()[0]; ++batch) { - for (int c = 0; c < output->dims()[1]; ++c) { - int channel = batch * output->dims()[1] + c; - const float *input_ptr = input_data + channel * image_size; - float *output_ptr = output_data + channel * out_image_size; - // top - for (int h = 0; h < valid_h_start; ++h) { - Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, exclusive, output_ptr); - } - // valid - int output_w_tiles = valid_w / 6; - int output_w_remain = valid_w - output_w_tiles * 6; - for (int h = valid_h_start; h < valid_h_end - 2; h += 3) { - const float *input_ptr0 = input_ptr + (2 * h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - const float *input_ptr3 = input_ptr2 + input_w; - const float *input_ptr4 = input_ptr3 + input_w; - const float *input_ptr5 = input_ptr4 + input_w; - const float *input_ptr6 = input_ptr5 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - float *output_ptr1 = output_ptr0 + output_w; - float *output_ptr2 = output_ptr1 + output_w; - // pad left - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t row3 = vld1_f32(input_ptr3); - float32x2_t row4 = vld1_f32(input_ptr4); - float32x2_t row5 = vld1_f32(input_ptr5); - float32x2_t row6 = vld1_f32(input_ptr6); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, acc1, acc2, post; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - (w << 1); - if (padding >= 3) { - output_ptr0[w] = 0.f; - output_ptr1[w] = 0.f; - output_ptr2[w] = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc0 = vPoolPre_f32

(row0, row1); - acc1 = vPoolPre_f32

(row2, row3); - acc2 = vPoolPre_f32

(row4, row5); - acc0 = vPoolPre_f32

(acc0, row2); - acc1 = vPoolPre_f32

(acc1, row4); - acc2 = vPoolPre_f32

(acc2, row6); - if (padding == 1) { - acc0 = vpPoolPre_f32

(acc0, acc0); - acc1 = vpPoolPre_f32

(acc1, acc1); - acc2 = vpPoolPre_f32

(acc2, acc2); - } - acc0 = vPoolPost_f32

(acc0, post); - acc1 = vPoolPost_f32

(acc1, post); - acc2 = vPoolPost_f32

(acc2, post); - vst1_lane_f32(output_ptr0 + w, acc0, 0); - vst1_lane_f32(output_ptr1 + w, acc1, 0); - vst1_lane_f32(output_ptr2 + w, acc2, 0); - } - } - input_ptr0 += valid_input_w_start; - input_ptr1 += valid_input_w_start; - input_ptr2 += valid_input_w_start; - input_ptr3 += valid_input_w_start; - input_ptr4 += valid_input_w_start; - input_ptr5 += valid_input_w_start; - input_ptr6 += valid_input_w_start; - output_ptr0 += valid_w_start; - output_ptr1 += valid_w_start; - output_ptr2 += valid_w_start; - } - // valid - float32x4x2_t x0, x1, x2; - float32x4x2_t y0, y1, y2; - float32x4_t post = vdupq_n_f32(1.f / 9); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr0 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0 = vld2q_f32(input_ptr1); - x1 = vld2q_f32(input_ptr1 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - - x0 = vld2q_f32(input_ptr2); - x1 = vld2q_f32(input_ptr2 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y1.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(y1.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr0, y0.val[0]); - vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); - - x0 = vld2q_f32(input_ptr3); - x1 = vld2q_f32(input_ptr3 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(x0.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(x0.val[1], y1.val[1]); - - x0 = vld2q_f32(input_ptr4); - x1 = vld2q_f32(input_ptr4 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(y0.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(y0.val[1], y1.val[1]); - y1.val[0] = vPoolPostq_f32

(y1.val[0], post); - y1.val[1] = vPoolPostq_f32

(y1.val[1], post); - vst1q_f32(output_ptr1, y1.val[0]); - vst1_f32(output_ptr1 + 4, vget_low_f32(y1.val[1])); - - x0 = vld2q_f32(input_ptr5); - x1 = vld2q_f32(input_ptr5 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - - x0 = vld2q_f32(input_ptr6); - x1 = vld2q_f32(input_ptr6 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr2, y0.val[0]); - vst1_f32(output_ptr2 + 4, vget_low_f32(y0.val[1])); - - input_ptr0 += 12; - input_ptr1 += 12; - input_ptr2 += 12; - input_ptr3 += 12; - input_ptr4 += 12; - input_ptr5 += 12; - input_ptr6 += 12; - output_ptr0 += 6; - output_ptr1 += 6; - output_ptr2 += 6; - } - // remain width - if (output_w_remain > 0) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr0 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0 = vld2q_f32(input_ptr1); - x1 = vld2q_f32(input_ptr1 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - - x0 = vld2q_f32(input_ptr2); - x1 = vld2q_f32(input_ptr2 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y1.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y1.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(y1.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(y1.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - - x0 = vld2q_f32(input_ptr3); - x1 = vld2q_f32(input_ptr3 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(x0.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(x0.val[1], y1.val[1]); - - x0 = vld2q_f32(input_ptr4); - x1 = vld2q_f32(input_ptr4 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y2.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y1.val[0] = vPoolPreq_f32

(y2.val[0], y1.val[0]); - y1.val[1] = vPoolPreq_f32

(y2.val[1], y1.val[1]); - y1.val[0] = vPoolPostq_f32

(y1.val[0], post); - y1.val[1] = vPoolPostq_f32

(y1.val[1], post); - - x0 = vld2q_f32(input_ptr5); - x1 = vld2q_f32(input_ptr5 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y2.val[0] = vPoolPreq_f32

(x0.val[0], y2.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], y2.val[1]); - - x0 = vld2q_f32(input_ptr6); - x1 = vld2q_f32(input_ptr6 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y2.val[0] = vPoolPreq_f32

(x0.val[0], y2.val[0]); - y2.val[1] = vPoolPreq_f32

(x0.val[1], y2.val[1]); - y2.val[0] = vPoolPostq_f32

(y2.val[0], post); - y2.val[1] = vPoolPostq_f32

(y2.val[1], post); - - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0.val[0], 0); - vst1q_lane_f32(output_ptr1, y1.val[0], 0); - vst1q_lane_f32(output_ptr2, y2.val[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1_f32(output_ptr1, vget_low_f32(y1.val[0])); - vst1_f32(output_ptr2, vget_low_f32(y2.val[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1_f32(output_ptr1, vget_low_f32(y1.val[0])); - vst1_f32(output_ptr2, vget_low_f32(y2.val[0])); - vst1q_lane_f32(output_ptr0 + 2, y0.val[0], 2); - vst1q_lane_f32(output_ptr1 + 2, y1.val[0], 2); - vst1q_lane_f32(output_ptr2 + 2, y2.val[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_f32(output_ptr1, y1.val[0]); - vst1q_f32(output_ptr2, y2.val[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_f32(output_ptr1, y1.val[0]); - vst1q_f32(output_ptr2, y2.val[0]); - vst1q_lane_f32(output_ptr0 + 4, y0.val[1], 0); - vst1q_lane_f32(output_ptr1 + 4, y1.val[1], 0); - vst1q_lane_f32(output_ptr2 + 4, y2.val[1], 0); - break; - } - input_ptr0 += (output_w_remain << 1); - input_ptr1 += (output_w_remain << 1); - input_ptr2 += (output_w_remain << 1); - input_ptr3 += (output_w_remain << 1); - input_ptr4 += (output_w_remain << 1); - input_ptr5 += (output_w_remain << 1); - input_ptr6 += (output_w_remain << 1); - output_ptr0 += output_w_remain; - output_ptr1 += output_w_remain; - output_ptr2 += output_w_remain; - } - // pad right - if (padding_r > 0) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t row3 = vld1_f32(input_ptr3); - float32x2_t row4 = vld1_f32(input_ptr4); - float32x2_t row5 = vld1_f32(input_ptr5); - float32x2_t row6 = vld1_f32(input_ptr6); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, acc1, acc2, post; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0.f; - *output_ptr1 = 0.f; - *output_ptr2 = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc0 = vPoolPre_f32

(row0, row1); - acc1 = vPoolPre_f32

(row2, row3); - acc2 = vPoolPre_f32

(row4, row5); - acc0 = vPoolPre_f32

(acc0, row2); - acc1 = vPoolPre_f32

(acc1, row4); - acc2 = vPoolPre_f32

(acc2, row6); - if (padding == 1) { - acc0 = vpPoolPre_f32

(acc0, acc0); - acc1 = vpPoolPre_f32

(acc1, acc1); - acc2 = vpPoolPre_f32

(acc2, acc2); - } - acc0 = vPoolPost_f32

(acc0, post); - acc1 = vPoolPost_f32

(acc1, post); - acc2 = vPoolPost_f32

(acc2, post); - vst1_lane_f32(output_ptr0, acc0, 0); - vst1_lane_f32(output_ptr1, acc1, 0); - vst1_lane_f32(output_ptr2, acc2, 0); - } - output_ptr0++; - output_ptr1++; - output_ptr2++; - } - } - } - // remain height - int start_h = valid_h_start + valid_h / 3 * 3; - for (int h = start_h; h < valid_h_end; ++h) { - const float *input_ptr0 = input_ptr + (2 * h - padding_h) * input_w; - const float *input_ptr1 = input_ptr0 + input_w; - const float *input_ptr2 = input_ptr1 + input_w; - float *output_ptr0 = output_ptr + h * output_w; - // pad left - if (padding_w) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, post; - for (int w = valid_w_start - 1; w >= 0; --w) { - int padding = padding_w - (w << 1); - if (padding >= 3) { - output_ptr0[w] = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc0 = vPoolPre_f32

(row0, row1); - acc0 = vPoolPre_f32

(acc0, row2); - if (padding == 1) { - acc0 = vpPoolPre_f32

(acc0, acc0); - } - acc0 = vPoolPost_f32

(acc0, post); - vst1_lane_f32(output_ptr0 + w, acc0, 0); - } - } - input_ptr0 += valid_input_w_start; - input_ptr1 += valid_input_w_start; - input_ptr2 += valid_input_w_start; - output_ptr0 += valid_w_start; - } - // valid - float32x4x2_t x0, x1, x2, y0; - float32x4_t post = vdupq_n_f32(1.f / 9); - for (int loop = 0; loop < output_w_tiles; ++loop) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr0 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0 = vld2q_f32(input_ptr1); - x1 = vld2q_f32(input_ptr1 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - - x0 = vld2q_f32(input_ptr2); - x1 = vld2q_f32(input_ptr2 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - vst1q_f32(output_ptr0, y0.val[0]); - vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1])); - - input_ptr0 += 12; - input_ptr1 += 12; - input_ptr2 += 12; - output_ptr0 += 6; - } - // remain width - if (output_w_remain > 0) { - x0 = vld2q_f32(input_ptr0); - x1 = vld2q_f32(input_ptr0 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - - x0 = vld2q_f32(input_ptr1); - x1 = vld2q_f32(input_ptr1 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - - x0 = vld2q_f32(input_ptr2); - x1 = vld2q_f32(input_ptr2 + 8); - x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1); - x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x0.val[1]); - x0.val[1] = vPoolPreq_f32

(x1.val[0], x1.val[1]); - x0.val[0] = vPoolPreq_f32

(x0.val[0], x2.val[0]); - x0.val[1] = vPoolPreq_f32

(x0.val[1], x2.val[1]); - y0.val[0] = vPoolPreq_f32

(x0.val[0], y0.val[0]); - y0.val[1] = vPoolPreq_f32

(x0.val[1], y0.val[1]); - y0.val[0] = vPoolPostq_f32

(y0.val[0], post); - y0.val[1] = vPoolPostq_f32

(y0.val[1], post); - // restore - switch (output_w_remain) { - case 1: - vst1q_lane_f32(output_ptr0, y0.val[0], 0); - break; - case 2: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - break; - case 3: - vst1_f32(output_ptr0, vget_low_f32(y0.val[0])); - vst1q_lane_f32(output_ptr0 + 2, y0.val[0], 2); - break; - case 4: - vst1q_f32(output_ptr0, y0.val[0]); - break; - case 5: - vst1q_f32(output_ptr0, y0.val[0]); - vst1q_lane_f32(output_ptr0 + 4, y0.val[1], 0); - break; - } - input_ptr0 += (output_w_remain << 1); - input_ptr1 += (output_w_remain << 1); - input_ptr2 += (output_w_remain << 1); - output_ptr0 += output_w_remain; - } - // pad right - if (padding_r > 0) { - float32x2_t row0 = vld1_f32(input_ptr0); - float32x2_t row1 = vld1_f32(input_ptr1); - float32x2_t row2 = vld1_f32(input_ptr2); - float32x2_t pad0 = vPoolInit_f32

(); - float32x2_t acc0, post; - for (int w = valid_w_end; w < output_w; ++w) { - int padding = 2 * w + 3 - (padding_w + input_w); - if (padding >= 3) { - *output_ptr0 = 0.f; - } else { - post = exclusive ? vdup_n_f32(1.f / (3 * (3 - padding))) - : vdup_n_f32(1.f / 9); - acc0 = vPoolPre_f32

(row0, row1); - acc0 = vPoolPre_f32

(acc0, row2); - if (padding == 1) { - acc0 = vpPoolPre_f32

(acc0, acc0); - } - acc0 = vPoolPost_f32

(acc0, post); - vst1_lane_f32(output_ptr0, acc0, 0); - } - output_ptr0++; - } - } - } - // bottom - for (int h = valid_h_end; h < output_h; ++h) { - Pooling3x3NormalRow(input_ptr, h, input_h, input_w, padding_h, - padding_w, output_w, exclusive, output_ptr); - } - } - } - } -}; - -template struct Pooling3x3; -template struct Pooling3x3; -template struct Pooling3x3; -template struct Pooling3x3; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // __ARM_NEON -#endif // POOL_OP diff --git a/mobile/src/operators/math/quantize.h b/mobile/src/operators/math/quantize.h deleted file mode 100644 index 9f6b2437f5..0000000000 --- a/mobile/src/operators/math/quantize.h +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef QUANT_OP - -#pragma once - -#include -#include "common/types.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -inline int8_t Round(const float &x) { - return static_cast(x); -} - -template <> -inline int8_t Round(const float &x) { - return std::round(x); -} - -template <> -inline int8_t Round(const float &x) { - float v = std::round(x); - int32_t q = static_cast(v); - if (fabs(fabs(q - v) - 0.5) <= 0) { - if (abs(q) % 2 != 0) { - q = q + ((q > 0) ? 
-1 : 1); - } - } - return static_cast(q); -} - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -template -inline int32x4_t vRoundq_f32(const float32x4_t &x) { - return vcvtq_s32_f32(x); -} - -template <> -inline int32x4_t vRoundq_f32(const float32x4_t &x) { -#if __aarch64__ - return vcvtaq_s32_f32(x); -#else - float32x4_t plus = vdupq_n_f32(0.5); - float32x4_t minus = vdupq_n_f32(-0.5); - float32x4_t zero = vdupq_n_f32(0); - uint32x4_t more_than_zero = vcgtq_f32(x, zero); - float32x4_t temp = vbslq_f32(more_than_zero, plus, minus); - temp = vaddq_f32(x, temp); - int32x4_t ret = vcvtq_s32_f32(temp); - return ret; -#endif -} - -template <> -inline int32x4_t vRoundq_f32(const float32x4_t &x) { -#if __aarch64__ - return vcvtnq_s32_f32(x); -#else - float32x4_t point5 = vdupq_n_f32(0.5); - int32x4_t one = vdupq_n_s32(1); - int32x4_t zero = vdupq_n_s32(0); - - int32x4_t rnd = math::vRoundq_f32(x); - float32x4_t frnd = vcvtq_f32_s32(rnd); - frnd = vsubq_f32(frnd, x); - frnd = vabsq_f32(frnd); - uint32x4_t equal_point5 = vceqq_f32(frnd, point5); - int32x4_t abs_rnd = vabsq_s32(rnd); - abs_rnd = vandq_s32(abs_rnd, one); - uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd); - uint32x4_t mask = vandq_u32(equal_point5, not_mod2); - uint32x4_t more_than_zero = vcgtq_s32(rnd, zero); - more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one)); - mask = veorq_u32(more_than_zero, mask); - more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one)); - mask = vaddq_u32(more_than_zero, mask); - int32x4_t smask = vreinterpretq_s32_u32(mask); - smask = vsubq_s32(smask, one); - rnd = vaddq_s32(rnd, smask); - return rnd; -#endif -} -#endif // __ARM_NEON__ - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // QUANT_OP diff --git a/mobile/src/operators/math/selected_rows_functor.h b/mobile/src/operators/math/selected_rows_functor.h deleted file mode 100644 index f8b5521e4d..0000000000 --- 
a/mobile/src/operators/math/selected_rows_functor.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "framework/selected_rows.h" - -#define INLINE_FOR2(sizei, sizej) \ - for (int64_t i = 0; i < sizei; i++) \ - for (int64_t j = 0; j < sizej; j++) - -namespace paddle_mobile { -namespace operators { -namespace math { - -// SelectedRows + SelectedRows will simplely concat value and rows. -// The real computation happens in dealing with LoDTensor. 
-// template -// struct SelectedRowsAdd { -// void operator()( -// const framework::SelectedRows& input1, -// const framework::SelectedRows& input2, -// framework::SelectedRows* output); -//}; -// -// template -// struct SelectedRowsAddTensor { -// void operator()( -// const framework::SelectedRows& input1, -// const framework::Tensor& input2, framework::Tensor* output); -//}; - -// input2 = input1 + input2 -template -struct SelectedRowsAddTo { - void operator()(const framework::SelectedRows& input1, - const int64_t input2_offset, - framework::SelectedRows* input2) { - auto in1_height = input1.height(); - PADDLE_MOBILE_ENFORCE(in1_height == input2->height(), "height error"); - - auto& in1_rows = input1.rows(); - auto& in2_rows = *(input2->mutable_rows()); - - auto& in1_value = input1.value(); - auto* in2_value = input2->mutable_value(); - - // concat rows - in2_rows.Extend(in1_rows.begin(), in1_rows.end()); - - // auto in1_place = input1.place(); - // PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); - // auto in2_place = input2->place(); - // PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); - - auto* in1_data = in1_value.data(); - auto* in2_data = in2_value->data(); - memory::Copy(in2_data + input2_offset, in1_data, - in1_value.numel() * sizeof(T)); - } -}; - -// input2 = input1 + input2 -template -struct SelectedRowsAddToTensor { - void operator()(const framework::SelectedRows& input1, - framework::Tensor* input2) { - auto in1_height = input1.height(); - auto in2_dims = input2->dims(); - PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0], "height != dims[0]"); - - auto& in1_value = input1.value(); - auto& in1_rows = input1.rows(); - - int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_MOBILE_ENFORCE(in1_row_numel == input2->numel() / in1_height, - "row_numel error"); - - auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); - - for (size_t i = 0; i < in1_rows.size(); i++) { - for (int64_t j = 0; j < in1_row_numel; j++) { - 
input2_data[in1_rows[i] * in1_row_numel + j] += - in1_data[i * in1_row_numel + j]; - } - } - } -}; - -// namespace scatter { -//// functors for manuplating SelectedRows data -// template -// struct MergeAdd { -// // unary functor, merge by adding duplicated rows in -// // the input SelectedRows object. -// framework::SelectedRows operator()( -// const framework::SelectedRows& input); -//}; - -// template -// struct Add { -// framework::SelectedRows operator()( -// const framework::SelectedRows& input1, -// const framework::SelectedRows& input2) { -// framework::SelectedRows out; -// out.set_rows(input1.rows()); -// out.set_height(input1.height()); -// out.mutable_value()->mutable_data(input1.value().dims(), -// ); -// auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); -// auto e_in1 = framework::EigenVector::Flatten(input1.value()); -// auto e_in2 = framework::EigenVector::Flatten(input2.value()); -// e_out.device(*context.eigen_device()) = e_in1 + e_in2; -// return out; -// } -//}; - -// template -// struct Mul { -// // multiply two SelectedRows -// framework::SelectedRows operator()( -// const framework::SelectedRows& input1, -// const framework::SelectedRows& input2) { -// framework::SelectedRows out; -// out.set_rows(input1.rows()); -// out.set_height(input1.height()); -// out.mutable_value()->mutable_data(input1.value().dims() -// ); -// auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); -// auto e_in1 = framework::EigenVector::Flatten(input1.value()); -// auto e_in2 = framework::EigenVector::Flatten(input2.value()); -// e_out.device(*context.eigen_device()) = e_in1 * e_in2; -// return out; -// } -// // multiply scalar to SelectedRows -// framework::SelectedRows operator()( -// const framework::SelectedRows& input1, -// const T input2) { -// framework::SelectedRows out; -// out.set_rows(input1.rows()); -// out.set_height(input1.height()); -// out.mutable_value()->mutable_data(input1.value().dims(), -// ); -// auto e_out = 
framework::EigenVector::Flatten(*(out.mutable_value())); -// auto e_in1 = framework::EigenVector::Flatten(input1.value()); -// e_out.device(*context.eigen_device()) = input2 * e_in1; -// return out; -// } -//}; - -enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; - -// out = seleted_rows_in / tensor -template -struct UpdateToTensor { - void operator()(const ScatterOps& op, const framework::SelectedRows& input1, - framework::Tensor* input2); -}; - -// namespace scatter -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/sequence2batch.cpp b/mobile/src/operators/math/sequence2batch.cpp deleted file mode 100644 index 097a258ddd..0000000000 --- a/mobile/src/operators/math/sequence2batch.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/math/sequence2batch.h" -#include -#include "common/types.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -class CopyMatrixRowsFunctor { - public: - void operator()(const framework::Tensor& src, std::vector index_lod, - framework::Tensor* dst, bool is_src_index) { - size_t* index = index_lod.data(); - auto src_dims = src.dims(); - auto dst_dims = dst->dims(); - PADDLE_MOBILE_ENFORCE((src_dims.size() == 2UL), - "The src must be matrix with rank 2."); - PADDLE_MOBILE_ENFORCE((dst_dims.size() == 2UL), - "The dst must be matrix with rank 2."); - PADDLE_MOBILE_ENFORCE((src_dims[1] == dst_dims[1]), - "The width of src and dst must be same."); - auto height = dst_dims[0]; - auto width = dst_dims[1]; - auto* src_data = src.data(); - auto* dst_data = dst->data(); - for (int i = 0; i < height; ++i) { - if (is_src_index) { - memcpy(dst_data + i * width, src_data + index[i] * width, - width * sizeof(T)); - } else { - memcpy(dst_data + index[i] * width, src_data + i * width, - width * sizeof(T)); - } - } - } -}; - -template class CopyMatrixRowsFunctor; - -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/sequence2batch.h b/mobile/src/operators/math/sequence2batch.h deleted file mode 100644 index 537f2326d0..0000000000 --- a/mobile/src/operators/math/sequence2batch.h +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "framework/lod_tensor.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { -template -class CopyMatrixRowsFunctor { - public: - // If is_src_index is true, - // copy the indexed rows of input src to the output dst. - // If is_src_index is false, - // copy the input src to the indexed rows of output dst. - // The indexed rows are based on the input index. - void operator()(const framework::Tensor& src, std::vector index_lod, - framework::Tensor* dst, bool is_src_index); -}; - -template -class LoDTensor2BatchFunctor { - // Calculate the length of each sequence and - // sort sequence index by the length. 
- // example: sequences = {s0, s1, s2} - // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} - // - struct SeqInfo { - SeqInfo(int start, int length, int seq_idx) - : start(start), length(length), seq_idx(seq_idx) {} - int start; - int length; - int seq_idx; - }; - - public: - void operator()(const framework::LoDTensor& lod_tensor, - framework::LoDTensor* batch, bool is_cal_batch_lod, - bool is_reverse = false) { - if (!is_cal_batch_lod) { - auto lods = batch->lod(); - PADDLE_MOBILE_ENFORCE( - (lods.size() > 2UL), - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_MOBILE_ENFORCE( - (lods[1].size() == static_cast(lod_tensor.dims()[0])), - "The LoD information should be consistent with the dims."); - CopyMatrixRowsFunctor to_batch; - to_batch(lod_tensor, lods[1], batch, true); - return; - } - - auto lods = lod_tensor.lod(); - PADDLE_MOBILE_ENFORCE((lods.size() == 1UL), - "Only support 1 level sequence, but %d is given", - lods.size()); - - const auto& lod = lods[0]; - std::vector seq_info; - for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { - int length = lod[seq_id + 1] - lod[seq_id]; - seq_info.emplace_back(lod[seq_id], length, seq_id); - } - - std::sort(seq_info.begin(), seq_info.end(), - [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); - - // Calculate the start position of each batch. - // example: sequences = {s0, s1, s2} - // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // num_batch = 5, - // batchIndex = {b0, b1, b2, b3, b4} - // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 - // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} - // batch_start_positions[0] = len(b0) - // batch_start_positions[1] = len(b0) + len(b1) - // batch_start_positions[2] = len(b0) + len(b1) + len(b2) - // ... - // seq2batch_idx[12] = {4, 0, 9, - // 5, 1, 10, - // 6, 2, 11, - // 7, 3, - // 8} - // seq_order = {1, 0, 2}, the sort order. 
- // where 1 is the second sequence, - // 0 is the first sequence, - // 2 is the third sequence. - // The num_batch represents batch size after rearranging the - // input LodTensor. It is also the maximum length of input sequence. - - framework::LoD batch_lods; - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); - - // batch_lods[0] is the start positions for batch LoDTensor - int num_batch = seq_info[0].length; - batch_lods[0].resize(static_cast(num_batch + 1)); - // batch_lods[1] is the raw index in the input LoDTensor - batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); - // batch_lods[2] is the sort order for the input LoDTensor. - batch_lods[2].resize(seq_info.size()); - - size_t* batch_starts = batch_lods[0].data(); - size_t* seq2batch_idx = batch_lods[1].data(); - batch_starts[0] = 0; - for (int n = 0; n < num_batch; n++) { - auto batch_id = static_cast(batch_starts[n]); - for (size_t i = 0; i < seq_info.size(); ++i) { - int seq_len = seq_info[i].length; - int start = seq_info[i].start; - if (n < seq_len) { - seq2batch_idx[batch_id] = - is_reverse ? 
start + seq_len - 1 - n : start + n; - batch_id++; - } else { - break; - } - } - batch_starts[n + 1] = static_cast(batch_id); - } - size_t* seq_order = batch_lods[2].data(); - for (size_t i = 0; i < seq_info.size(); ++i) { - seq_order[i] = seq_info[i].seq_idx; - } - batch->set_lod(batch_lods); - - CopyMatrixRowsFunctor to_batch; - to_batch(lod_tensor, batch_lods[1], batch, true); - } -}; - -template -class Batch2LoDTensorFunctor { - public: - void operator()(const framework::LoDTensor& batch, - framework::LoDTensor* lod_tensor) { - auto in_lod = batch.lod(); - PADDLE_MOBILE_ENFORCE( - (in_lod.size() > 2UL), - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_MOBILE_ENFORCE( - (in_lod[1].size() == static_cast(lod_tensor->dims()[0])), - "The LoD information should be consistent with the dims."); - CopyMatrixRowsFunctor to_seq; - to_seq(batch, in_lod[1], lod_tensor, false); - } -}; -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/slidingwindow_conv3x3.cpp b/mobile/src/operators/math/slidingwindow_conv3x3.cpp deleted file mode 100644 index 0f4fbcbd93..0000000000 --- a/mobile/src/operators/math/slidingwindow_conv3x3.cpp +++ /dev/null @@ -1,5668 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/math/slidingwindow_conv3x3.h" -#include -#include "framework/context.h" -#include "operators/math/slidingwindow_utils.h" -#if __ARM_NEON -#include -#endif -#ifdef _OPENMP -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { -template <> -void SlidingwindowConv3x3s1(const framework::Tensor *input, - const framework::Tensor *filter, - const std::vector &paddings, - framework::Tensor *output) { - const int batch = input->dims()[0]; - const int input_ch = input->dims()[1]; - const int input_h = input->dims()[2]; - const int input_w = input->dims()[3]; - const int output_ch = output->dims()[1]; - const int output_h = output->dims()[2]; - const int output_w = output->dims()[3]; - const int padding_h = paddings[0]; - const int padding_w = paddings[1]; - - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - const float *filter_data = filter->data(); - - const int in_ch_size = input_h * input_w; - const int in_batch_size = input_ch * in_ch_size; - const int out_ch_size = output_h * output_w; - const int out_batch_size = output_ch * out_ch_size; - const int out_size = batch * out_batch_size; - const int filter_ch_size = 9; - const int pad_filter_ch_size = (2 * padding_h + 3) * (2 * padding_w + 3); - const int pad_filter_start = - 2 * padding_h * (2 * padding_w + 3) + 2 * padding_w; - const int pad_filter_w = 3 + padding_w * 2; - bool if_nopadding = false; - -#if __ARM_NEON - float *out_ptr = output_data; - int remain = out_size & 0x3; - float32x4_t _zero = vdupq_n_f32(0.0); - - for (int i = 0; i < out_size; i += 4) { - vst1q_f32(out_ptr, _zero); - out_ptr += 4; - } - switch (remain) { - case 1: - vst1q_lane_f32(out_ptr, _zero, 0); - break; - case 2: - vst1_f32(out_ptr, vget_low_f32(_zero)); - break; - case 3: - vst1_f32(out_ptr, vget_low_f32(_zero)); - vst1q_lane_f32(out_ptr + 2, _zero, 0); - break; - } -#else -#pragma omp parallel for - for (int i = 0; i < out_size; ++i) { - 
output_data[i] = 0; - } -#endif - if (padding_h == 0 && padding_w == 0) { - if_nopadding = true; - } - - for (int b = 0; b < batch; ++b) { -#pragma omp parallel for - for (int o_c = 0; o_c < output_ch - 1; o_c += 2) { - bool issamefilter; - const float *f1; - const float *f1_c2; - const float *in_ptr1, *in_ptr2, *in_ptr3, *in_ptr4; - const float *pad_filter0, *pad_filter1, *pad_filter2, *pad_filter3; - const float *pad_filter0_c2, *pad_filter1_c2, *pad_filter2_c2, - *pad_filter3_c2; - float pad_filter_arr[pad_filter_ch_size]; - float pad_filter_arr_c2[pad_filter_ch_size]; - - float *output_data_ch; - float *output_data_ch_2; - const float *input_data_ch; - const float *filter_data_ch; - const float *filter_data_ch_c2; - - filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; - filter_data_ch_c2 = filter_data + (o_c + 1) * filter_ch_size * input_ch; - - input_data_ch = input_data; - output_data_ch = output_data + o_c * out_ch_size; - output_data_ch_2 = output_data + (o_c + 1) * out_ch_size; - - for (int i_c = 0; i_c < input_ch; ++i_c) { - f1 = filter_data_ch; - f1_c2 = filter_data_ch_c2; - - if (!if_nopadding) { - memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); - memset(pad_filter_arr_c2, 0.f, sizeof(pad_filter_arr_c2)); - for (int i = 0; i < 9; i++) { - int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + - padding_w * (2 * padding_h + 1); - pad_filter_arr[j] = filter_data_ch[i]; - pad_filter_arr_c2[j] = filter_data_ch_c2[i]; - } - pad_filter1 = pad_filter_arr; - pad_filter1 += pad_filter_start; - pad_filter0 = pad_filter1 - pad_filter_w; - pad_filter2 = pad_filter1 + pad_filter_w; - pad_filter3 = pad_filter2 + pad_filter_w; - - pad_filter1_c2 = pad_filter_arr_c2; - pad_filter1_c2 += pad_filter_start; - pad_filter0_c2 = pad_filter1_c2 - pad_filter_w; - pad_filter2_c2 = pad_filter1_c2 + pad_filter_w; - pad_filter3_c2 = pad_filter2_c2 + pad_filter_w; - } else { - pad_filter1 = filter_data_ch; - pad_filter2 = pad_filter1 + 3; - pad_filter3 = 
pad_filter2 + 3; - - pad_filter1_c2 = filter_data_ch_c2; - pad_filter2_c2 = pad_filter1_c2 + 3; - pad_filter3_c2 = pad_filter2_c2 + 3; - } - float *out_ptr1, *out_ptr2; - float *out_ptr1_c2, *out_ptr2_c2; - - out_ptr1 = output_data_ch; - out_ptr2 = out_ptr1 + output_w; - out_ptr1_c2 = output_data_ch_2; - out_ptr2_c2 = out_ptr1_c2 + output_w; - - in_ptr1 = input_data_ch; - in_ptr2 = in_ptr1 + input_w; - in_ptr3 = in_ptr2 + input_w; - in_ptr4 = in_ptr3 + input_w; - - int o_h = 0; - for (; o_h < output_h - 1; o_h = o_h + 2) { - if (!if_nopadding && - (o_h < padding_h || o_h > output_h - padding_h - 2)) { - issamefilter = false; - } else { - issamefilter = true; - } - int o_w = 0; - // pad left - for (; o_w < padding_w; ++o_w) { - float sum1 = 0; - float sum2 = 0; - float sum1_c2 = 0; - float sum2_c2 = 0; - - if (issamefilter) { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); - float32x4_t _sum2_c2 = vmulq_f32(_in_ptr2, _pad_filter1_c2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); - - float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); - _sum2 = vmlaq_f32(_sum2, _in_ptr4, 
_pad_filter3); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr4, _pad_filter3_c2); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ss2_2 = - vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); - sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; - sum2 += in_ptr4[0] * pad_filter3[0]; - sum2 += in_ptr4[1] * pad_filter3[1]; - sum2 += in_ptr4[2] * pad_filter3[2]; - - sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; 
- - sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; - sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; - sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; - sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; - sum2_c2 += in_ptr3[1] * pad_filter2_c2[1]; - sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; - sum2_c2 += in_ptr4[0] * pad_filter3_c2[0]; - sum2_c2 += in_ptr4[1] * pad_filter3_c2[1]; - sum2_c2 += in_ptr4[2] * pad_filter3_c2[2]; -#endif - } else { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); - float32x4_t _pad_filter0_c2 = vld1q_f32(pad_filter0_c2); - - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); - float32x4_t _sum2_c2 = vmulq_f32(_in_ptr1, _pad_filter0_c2); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr2, _pad_filter1_c2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - 
float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ss2_2 = - vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); - sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr1[0] * pad_filter0[0]; - sum2 += in_ptr1[1] * pad_filter0[1]; - sum2 += in_ptr1[2] * pad_filter0[2]; - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; - - sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; - - sum2_c2 += in_ptr1[0] * pad_filter0_c2[0]; - sum2_c2 += in_ptr1[1] * pad_filter0_c2[1]; - sum2_c2 += in_ptr1[2] * pad_filter0_c2[2]; - sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; - sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; - sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; - sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; - sum2_c2 += in_ptr3[1] * 
pad_filter2_c2[1]; - sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; -#endif - } - if (!if_nopadding && - (o_w < padding_w || o_w > output_w - padding_w - 2)) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - pad_filter0_c2--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - } else { - in_ptr1++; - in_ptr2++; - in_ptr3++; - in_ptr4++; - } - *out_ptr1 += sum1; - *out_ptr2 += sum2; - *out_ptr1_c2 += sum1_c2; - *out_ptr2_c2 += sum2_c2; - - out_ptr1++; - out_ptr2++; - out_ptr1_c2++; - out_ptr2_c2++; - } - // valid -#if __ARM_NEON -#if __aarch64__ - if (issamefilter) { - int loop = (output_w - 2 * padding_w) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "prfm pldl1keep, [%[f1], #256] \n\t" - "prfm pldl1keep, [%[f1_c2], #256] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" - "ld1 {v2.4s, v3.4s}, [%[f1_c2]], #32 \n\t" - "ld1 {v4.s}[0], [%[f1]] \n\t" - - "sub %[f1],%[f1], #32 \n\t" - "ld1 {v4.s}[1], [%[f1_c2]] \n\t" - "sub %[f1_c2],%[f1_c2], #32 \n\t" - - "prfm pldl1keep, [%[in_ptr1], #192] \n\t" - "prfm pldl1keep, [%[in_ptr4], #192] \n\t" - - "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" - "add %[in_ptr1],%[in_ptr1], #16 \n\t" - - "ld1 {v6.d}[1], [%[in_ptr4]] \n\t" - "add %[in_ptr4],%[in_ptr4], #8 \n\t" - "ld1 {v7.4s}, [%[in_ptr4]] \n\t" - "add %[in_ptr4],%[in_ptr4], #8 \n\t" - - "0: \n\t" - // load out_ptr - "prfm pldl1keep, [%[out_ptr1], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c2], #128] \n\t" - "prfm pldl1keep, [%[out_ptr2], #128] \n\t" - "prfm pldl1keep, [%[out_ptr2_c2], #128] \n\t" - - "ld1 {v12.4s}, [%[out_ptr1]] \n\t" - "ld1 {v13.4s}, [%[out_ptr1_c2]] \n\t" - "ld1 {v14.4s}, [%[out_ptr2]] \n\t" - "ld1 {v15.4s}, [%[out_ptr2_c2]] \n\t" - - // in_ptr1 and in_ptr4 multiply - "ext v8.16b, v5.16b, v6.16b, #4 \n\t" - "fmla v12.4s, v5.4s, v0.s[0] \n\t" - "fmla v13.4s, v5.4s, v2.s[0] \n\t" - - "ext v9.16b, v6.16b, v7.16b, #8 \n\t" - "fmla v14.4s, v7.4s, v4.s[0] \n\t" - "fmla v15.4s, v7.4s, v4.s[1] \n\t" - - "ext v10.16b, 
v5.16b, v6.16b, #8 \n\t" - "fmla v12.4s, v8.4s, v0.s[1] \n\t" - "fmla v13.4s, v8.4s, v2.s[1] \n\t" - - "ext v11.16b, v6.16b, v7.16b, #12 \n\t" - "fmla v14.4s, v9.4s, v1.s[2] \n\t" - "fmla v15.4s, v9.4s, v3.s[2] \n\t" - - "ld1 {v5.4s, v6.4s}, [%[in_ptr2]] \n\t" - "fmla v12.4s, v10.4s, v0.s[2] \n\t" - "fmla v13.4s, v10.4s, v2.s[2] \n\t" - - "add %[in_ptr2],%[in_ptr2], #16 \n\t" - "fmla v14.4s, v11.4s, v1.s[3] \n\t" - "fmla v15.4s, v11.4s, v3.s[3] \n\t" - - // in_ptr2 multiply - "ext v8.16b, v5.16b, v6.16b, #4 \n\t" - "fmla v12.4s, v5.4s, v0.s[3] \n\t" - "fmla v13.4s, v5.4s, v2.s[3] \n\t" - - "fmla v14.4s, v5.4s, v0.s[0] \n\t" - "fmla v15.4s, v5.4s, v2.s[0] \n\t" - - "ext v9.16b, v5.16b, v6.16b, #8 \n\t" - "fmla v12.4s, v8.4s, v1.s[0] \n\t" - "fmla v13.4s, v8.4s, v3.s[0] \n\t" - - "ld1 {v6.d}[1], [%[in_ptr3]] \n\t" - "add %[in_ptr3],%[in_ptr3], #8 \n\t" - "fmla v14.4s, v8.4s, v0.s[1] \n\t" - "fmla v15.4s, v8.4s, v2.s[1] \n\t" - - "ld1 {v7.4s}, [%[in_ptr3]] \n\t" - "add %[in_ptr3],%[in_ptr3], #8 \n\t" - - "fmla v12.4s, v9.4s, v1.s[1] \n\t" - "fmla v13.4s, v9.4s, v3.s[1] \n\t" - - "ext v10.16b, v6.16b, v7.16b, #8 \n\t" - "fmla v14.4s, v9.4s, v0.s[2] \n\t" - "fmla v15.4s, v9.4s, v2.s[2] \n\t" - - // in_ptr3 multiply - "fmla v12.4s, v7.4s, v4.s[0] \n\t" - "fmla v13.4s, v7.4s, v4.s[1] \n\t" - - "ext v11.16b, v6.16b, v7.16b, #12 \n\t" - "fmla v14.4s, v7.4s, v1.s[1] \n\t" - "fmla v15.4s, v7.4s, v3.s[1] \n\t" - - "fmla v12.4s, v10.4s, v1.s[2] \n\t" - "fmla v13.4s, v10.4s, v3.s[2] \n\t" - - "fmla v14.4s, v10.4s, v0.s[3] \n\t" - "fmla v15.4s, v10.4s, v2.s[3] \n\t" - - "fmla v12.4s, v11.4s, v1.s[3] \n\t" - "fmla v13.4s, v11.4s, v3.s[3] \n\t" - - "prfm pldl1keep, [%[in_ptr1], #192] \n\t" - "fmla v14.4s, v11.4s, v1.s[0] \n\t" - "fmla v15.4s, v11.4s, v3.s[0] \n\t" - - // store out_ptr - "prfm pldl1keep, [%[in_ptr4], #192] \n\t" - "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" - "add %[in_ptr1],%[in_ptr1], #16 \n\t" - "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" - - "ld1 {v6.d}[1], [%[in_ptr4]] 
\n\t" - "add %[in_ptr4],%[in_ptr4], #8 \n\t" - "st1 {v13.4s}, [%[out_ptr1_c2]], #16 \n\t" - - "ld1 {v7.4s}, [%[in_ptr4]] \n\t" - "add %[in_ptr4],%[in_ptr4], #8 \n\t" - "st1 {v14.4s}, [%[out_ptr2]], #16 \n\t" - - "subs %[loop],%[loop], #1 \n\t" - "st1 {v15.4s}, [%[out_ptr2_c2]], #16 \n\t" - - // cycle - "bne 0b \n\t" - "sub %[in_ptr1],%[in_ptr1], #16 \n\t" - "sub %[in_ptr4],%[in_ptr4], #16 \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), - [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), - [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), - [in_ptr4] "+r"(in_ptr4) - : [f1] "r"(f1), [f1_c2] "r"(f1_c2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); - } - } - if (!if_nopadding && o_w == output_w - padding_w) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - pad_filter0_c2--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - - in_ptr1--; - in_ptr2--; - in_ptr3--; - in_ptr4--; - } -#else - if (issamefilter) { - int loop = (output_w - 2 * padding_w) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "pld [%[f1], #256] \n\t" - "pld [%[f1_c2], #256] \n\t" - - "vld1.f32 {d0-d3}, [%[f1]] \n\t" - "add %[f1], #32 \n\t" - "vld1.f32 {d4-d7}, [%[f1_c2]] \n\t" - "add %[f1_c2], #32 \n\t" - - "vld1.f32 {d8[0]}, [%[f1]] \n\t" - "sub %[f1], #32 \n\t" - "vld1.f32 {d8[1]}, [%[f1_c2]] \n\t" - "sub %[f1_c2], #32 \n\t" - - "pld [%[in_ptr1], #192] \n\t" - "pld [%[in_ptr4], #192] \n\t" - - "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" - "add %[in_ptr1], #16 \n\t" - - "vld1.f32 {d13-d15}, [%[in_ptr4]] \n\t" - "add %[in_ptr4], #16 \n\t" - - "0: \n\t" - // load out_ptr - "pld [%[out_ptr1], #128] \n\t" - "pld [%[out_ptr1_c2], #128] \n\t" - "pld [%[out_ptr2], #128] \n\t" - "pld [%[out_ptr2_c2], #128] \n\t" - - "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" - "vld1.f32 {d26, d27}, [%[out_ptr1_c2]] \n\t" - 
"vld1.f32 {d28, d29}, [%[out_ptr2]] \n\t" - "vld1.f32 {d30, d31}, [%[out_ptr2_c2]] \n\t" - - // in_ptr1 + in_ptr4 multiply - "vext.32 q8, q5, q6, #1 \n\t" - "vmla.f32 q12, q5, d0[0] \n\t" - "vmla.f32 q13, q5, d4[0] \n\t" - - "vext.32 q9, q6, q7, #2 \n\t" - "vmla.f32 q14, q7, d8[0] \n\t" - "vmla.f32 q15, q7, d8[1] \n\t" - - "vext.32 q10, q5, q6, #2 \n\t" - "vmla.f32 q12, q8, d0[1] \n\t" - "vmla.f32 q13, q8, d4[1] \n\t" - - "vext.32 q11, q6, q7, #3 \n\t" - "vmla.f32 q14, q9, d3[0] \n\t" - "vmla.f32 q15, q9, d7[0] \n\t" - - "vld1.f32 {d10-d12}, [%[in_ptr2]] \n\t" - "add %[in_ptr2], #16 \n\t" - "vmla.f32 q12, q10, d1[0] \n\t" - "vmla.f32 q13, q10, d5[0] \n\t" - - "vmla.f32 q14, q11, d3[1] \n\t" - "vmla.f32 q15, q11, d7[1] \n\t" - - // in_ptr2 multiply - "vext.32 q8, q5, q6, #1 \n\t" - "vmla.f32 q12, q5, d1[1] \n\t" - "vmla.f32 q13, q5, d5[1] \n\t" - - "vmla.f32 q14, q5, d0[0] \n\t" - "vmla.f32 q15, q5, d4[0] \n\t" - - "vext.32 q9, q5, q6, #2 \n\t" - "vmla.f32 q12, q8, d2[0] \n\t" - "vmla.f32 q13, q8, d6[0] \n\t" - - "vld1.f32 {d13-d15}, [%[in_ptr3]] \n\t" - "add %[in_ptr3], #16 \n\t" - "vmla.f32 q14, q8, d0[1] \n\t" - "vmla.f32 q15, q8, d4[1] \n\t" - - "vmla.f32 q12, q9, d2[1] \n\t" - "vmla.f32 q13, q9, d6[1] \n\t" - - "vmla.f32 q14, q9, d1[0] \n\t" - "vmla.f32 q15, q9, d5[0] \n\t" - - // in_ptr3 multiply - "vext.32 q10, q6, q7, #2 \n\t" - "vmla.f32 q12, q7, d8[0] \n\t" - "vmla.f32 q13, q7, d8[1] \n\t" - "vmla.f32 q14, q7, d2[1] \n\t" - "vmla.f32 q15, q7, d6[1] \n\t" - - "vext.32 q11, q6, q7, #3 \n\t" - "vmla.f32 q12, q10, d3[0] \n\t" - "vmla.f32 q13, q10, d7[0] \n\t" - "vmla.f32 q14, q10, d1[1] \n\t" - "vmla.f32 q15, q10, d5[1] \n\t" - - "vmla.f32 q12, q11, d3[1] \n\t" - "vmla.f32 q13, q11, d7[1] \n\t" - "vmla.f32 q14, q11, d2[0] \n\t" - "vmla.f32 q15, q11, d6[0] \n\t" - - // store out_ptr - "pld [%[in_ptr1], #192] \n\t" - - "pld [%[in_ptr4], #192] \n\t" - "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" - "add %[in_ptr1], #16 \n\t" - - "vst1.f32 {d24, d25}, [%[out_ptr1]]! 
\n\t" - - "vst1.f32 {d26, d27}, [%[out_ptr1_c2]]! \n\t" - "vld1.f32 {d13-d15}, [%[in_ptr4]] \n\t" - - "add %[in_ptr4], #16 \n\t" - "vst1.f32 {d28, d29}, [%[out_ptr2]]! \n\t" - - "subs %[loop], #1 \n\t" - "vst1.f32 {d30, d31}, [%[out_ptr2_c2]]! \n\t" - - // cycle - "bne 0b \n\t" - "sub %[in_ptr1], #16 \n\t" - "sub %[in_ptr4], #16 \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), - [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), - [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), - [in_ptr4] "+r"(in_ptr4) - : [f1] "r"(f1), [f1_c2] "r"(f1_c2) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - } - } - if (!if_nopadding && o_w == output_w - padding_w) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - pad_filter0_c2--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - - in_ptr1--; - in_ptr2--; - in_ptr3--; - in_ptr4--; - } -#endif // __aarch64__ -#endif // __ARM_NEON - - // remain output_width - for (; o_w < output_w; ++o_w) { - float sum1 = 0; - float sum2 = 0; - float sum1_c2 = 0; - float sum2_c2 = 0; - - if (issamefilter) { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); - float32x4_t _sum2_c2 = vmulq_f32(_in_ptr2, _pad_filter1_c2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = 
vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); - - float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); - _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr4, _pad_filter3_c2); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ss2_2 = - vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); - sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; - sum2 += in_ptr4[0] * pad_filter3[0]; - sum2 += in_ptr4[1] * pad_filter3[1]; - sum2 += in_ptr4[2] * pad_filter3[2]; - - sum1_c2 
+= in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; - - sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; - sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; - sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; - sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; - sum2_c2 += in_ptr3[1] * pad_filter2_c2[1]; - sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; - sum2_c2 += in_ptr4[0] * pad_filter3_c2[0]; - sum2_c2 += in_ptr4[1] * pad_filter3_c2[1]; - sum2_c2 += in_ptr4[2] * pad_filter3_c2[2]; -#endif - } else { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); - float32x4_t _pad_filter0_c2 = vld1q_f32(pad_filter0_c2); - - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); - float32x4_t _sum2_c2 = vmulq_f32(_in_ptr1, _pad_filter0_c2); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr2, _pad_filter1_c2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, 
_pad_filter3_c2); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - _sum2_c2 = vmlaq_f32(_sum2_c2, _in_ptr3, _pad_filter2_c2); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - _sum2_c2 = vsetq_lane_f32(sum2_c2, _sum2_c2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ss2_2 = - vadd_f32(vget_low_f32(_sum2_c2), vget_high_f32(_sum2_c2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - float32x2_t _ssss1_2_ssss2_2 = vpadd_f32(_ss1_2, _ss2_2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum1_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); - sum2_c2 += vget_lane_f32(_ssss1_2_ssss2_2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr1[0] * pad_filter0[0]; - sum2 += in_ptr1[1] * pad_filter0[1]; - sum2 += in_ptr1[2] * pad_filter0[2]; - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; - - sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * 
pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; - - sum2_c2 += in_ptr1[0] * pad_filter0_c2[0]; - sum2_c2 += in_ptr1[1] * pad_filter0_c2[1]; - sum2_c2 += in_ptr1[2] * pad_filter0_c2[2]; - sum2_c2 += in_ptr2[0] * pad_filter1_c2[0]; - sum2_c2 += in_ptr2[1] * pad_filter1_c2[1]; - sum2_c2 += in_ptr2[2] * pad_filter1_c2[2]; - sum2_c2 += in_ptr3[0] * pad_filter2_c2[0]; - sum2_c2 += in_ptr3[1] * pad_filter2_c2[1]; - sum2_c2 += in_ptr3[2] * pad_filter2_c2[2]; -#endif - } - if (!if_nopadding && - (o_w < padding_w || o_w > output_w - padding_w - 2)) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - pad_filter0_c2--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - } else { - in_ptr1++; - in_ptr2++; - in_ptr3++; - in_ptr4++; - } - *out_ptr1 += sum1; - *out_ptr2 += sum2; - *out_ptr1_c2 += sum1_c2; - *out_ptr2_c2 += sum2_c2; - - out_ptr1++; - out_ptr2++; - out_ptr1_c2++; - out_ptr2_c2++; - } - if (if_nopadding) { - in_ptr1 += 2 + input_w; - in_ptr2 += 2 + input_w; - in_ptr3 += 2 + input_w; - in_ptr4 += 2 + input_w; - } else if (o_h == padding_h - 1 || o_h == output_h - padding_h - 2) { - in_ptr1 += 3; - in_ptr2 += 3; - in_ptr3 += 3; - in_ptr4 += 3; - - pad_filter0 -= 2; - pad_filter1 -= 2; - pad_filter2 -= 2; - pad_filter3 -= 2; - - pad_filter0_c2 -= 2; - pad_filter1_c2 -= 2; - pad_filter2_c2 -= 2; - pad_filter3_c2 -= 2; - - } else if (issamefilter) { - in_ptr1 += 3 + input_w; - in_ptr2 += 3 + input_w; - in_ptr3 += 3 + input_w; - in_ptr4 += 3 + input_w; - - pad_filter0 += 2 * padding_w + 1; - pad_filter1 += 2 * padding_w + 1; - pad_filter2 += 2 * padding_w + 1; - pad_filter3 += 2 * padding_w + 1; - - pad_filter0_c2 += 2 * padding_w + 1; - pad_filter1_c2 += 2 * padding_w + 1; - pad_filter2_c2 += 2 * padding_w + 1; - pad_filter3_c2 += 2 * padding_w + 1; - - } else { - pad_filter0 -= 3 + 2 * padding_w + 2; - pad_filter1 -= 3 + 2 * padding_w + 2; - pad_filter2 -= 3 + 2 * padding_w + 2; - pad_filter3 -= 3 + 2 * padding_w + 2; - - 
pad_filter0_c2 -= 3 + 2 * padding_w + 2; - pad_filter1_c2 -= 3 + 2 * padding_w + 2; - pad_filter2_c2 -= 3 + 2 * padding_w + 2; - pad_filter3_c2 -= 3 + 2 * padding_w + 2; - - in_ptr1 -= input_w - 3; - in_ptr2 -= input_w - 3; - in_ptr3 -= input_w - 3; - in_ptr4 -= input_w - 3; - } - out_ptr1 += output_w; - out_ptr2 += output_w; - out_ptr1_c2 += output_w; - out_ptr2_c2 += output_w; - } - // remain output_height - for (; o_h < output_h; ++o_h) { - int o_w = 0; - // pad left - for (; o_w < padding_w; ++o_w) { - float sum1 = 0; - float sum1_c2 = 0; -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); - - sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); - sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - 
sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; -#endif - if (!if_nopadding && - (o_w < padding_w || o_w > output_w - padding_w - 2)) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - pad_filter0_c2--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - } else { - in_ptr1++; - in_ptr2++; - in_ptr3++; - in_ptr4++; - } - *out_ptr1 += sum1; - *out_ptr1_c2 += sum1_c2; - - out_ptr1++; - out_ptr1_c2++; - } -// valid -#if __ARM_NEON -#if __aarch64__ - if (if_nopadding) { - int loop = (output_w - 2 * padding_w) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "prfm pldl1keep, [%[f1], #256] \n\t" - "prfm pldl1keep, [%[f1_c2], #256] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[f1]] \n\t" - "add %[f1], %[f1], #32 \n\t" - "ld1 {v2.4s, v3.4s}, [%[f1_c2]] \n\t" - "add %[f1_c2], %[f1_c2], #32 \n\t" - - "ld1 {v4.s}[0], [%[f1]] \n\t" - "sub %[f1],%[f1], #32 \n\t" - "ld1 {v4.s}[1], [%[f1_c2]] \n\t" - "sub %[f1_c2],%[f1_c2], #32 \n\t" - - "0: \n\t" - // load out_ptr - "prfm pldl1keep, [%[out_ptr1], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c2], #128] \n\t" - - "ld1 {v12.4s}, [%[out_ptr1]] \n\t" - "ld1 {v13.4s}, [%[out_ptr1_c2]] \n\t" - - // in_ptr1 multiply - "prfm pldl1keep, [%[in_ptr1], #192] \n\t" - "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" - "add %[in_ptr1],%[in_ptr1], #16 \n\t" - - "ext v8.16b, v5.16b, v6.16b, #4 \n\t" - "fmla v12.4s, v5.4s, v0.s[0] \n\t" - "fmla v13.4s, v5.4s, v2.s[0] \n\t" - - "ext v10.16b, v5.16b, v6.16b, #8 \n\t" - 
"fmla v12.4s, v8.4s, v0.s[1] \n\t" - "fmla v13.4s, v8.4s, v2.s[1] \n\t" - - "ld1 {v5.4s, v6.4s}, [%[in_ptr2]] \n\t" - "add %[in_ptr2],%[in_ptr2], #16 \n\t" - "fmla v12.4s, v10.4s, v0.s[2] \n\t" - "fmla v13.4s, v10.4s, v2.s[2] \n\t" - - // in_ptr2 multiply - "ext v8.16b, v5.16b, v6.16b, #4 \n\t" - "fmla v12.4s, v5.4s, v0.s[3] \n\t" - "fmla v13.4s, v5.4s, v2.s[3] \n\t" - - "ext v9.16b, v5.16b, v6.16b, #8 \n\t" - "fmla v12.4s, v8.4s, v1.s[0] \n\t" - "fmla v13.4s, v8.4s, v3.s[0] \n\t" - - "ld1 {v6.d}[1], [%[in_ptr3]] \n\t" - "add %[in_ptr3],%[in_ptr3], #8 \n\t" - "ld1 {v7.4s}, [%[in_ptr3]] \n\t" - "add %[in_ptr3],%[in_ptr3], #8 \n\t" - - "fmla v12.4s, v9.4s, v1.s[1] \n\t" - "fmla v13.4s, v9.4s, v3.s[1] \n\t" - - // in_ptr3 multiply - "ext v10.16b, v6.16b, v7.16b, #8 \n\t" - "fmla v12.4s, v7.4s, v4.s[0] \n\t" - "fmla v13.4s, v7.4s, v4.s[1] \n\t" - - "ext v11.16b, v6.16b, v7.16b, #12 \n\t" - "fmla v12.4s, v10.4s, v1.s[2] \n\t" - "fmla v13.4s, v10.4s, v3.s[2] \n\t" - - "fmla v12.4s, v11.4s, v1.s[3] \n\t" - "fmla v13.4s, v11.4s, v3.s[3] \n\t" - - // store out_ptr - "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" - "st1 {v13.4s}, [%[out_ptr1_c2]], #16 \n\t" - - // cycle - "subs %[loop],%[loop], #1 \n\t" - "bne 0b \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), - [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), - [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), - [in_ptr4] "+r"(in_ptr4) - : [f1] "r"(f1), [f1_c2] "r"(f1_c2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v8", "v9", "v10", "v11", "v12", "v13"); - } - } -#else - if (if_nopadding) { - int loop = (output_w - 2 * padding_w) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "pld [%[f1], #256] \n\t" - "pld [%[f1_c2], #256] \n\t" - - "vld1.f32 {d0-d3}, [%[f1]] \n\t" - "add %[f1], #32 \n\t" - "vld1.f32 {d4-d7}, [%[f1_c2]] \n\t" - "add %[f1_c2], #32 \n\t" - - "vld1.f32 {d8[0]}, [%[f1]] \n\t" - "sub %[f1], #32 
\n\t" - "vld1.f32 {d8[1]}, [%[f1_c2]] \n\t" - "sub %[f1_c2], #32 \n\t" - - "0: \n\t" - // load out_ptr - "pld [%[out_ptr1], #128] \n\t" - "pld [%[out_ptr1_c2], #128] \n\t" - - "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" - "vld1.f32 {d26, d27}, [%[out_ptr1_c2]] \n\t" - - // in_ptr1 multiply - "pld [%[in_ptr1], #128] \n\t" - - "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" - "add %[in_ptr1], #16 \n\t" - "vext.32 q8, q5, q6, #1 \n\t" - - "pld [%[in_ptr2], #128] \n\t" - "vmla.f32 q12, q5, d0[0] \n\t" - "vmla.f32 q13, q5, d4[0] \n\t" - - "vext.32 q10, q5, q6, #2 \n\t" - "vld1.f32 {d10-d12}, [%[in_ptr2]] \n\t" - "add %[in_ptr2], #16 \n\t" - "vmla.f32 q12, q8, d0[1] \n\t" - "vmla.f32 q13, q8, d4[1] \n\t" - - "vmla.f32 q12, q10, d1[0] \n\t" - "vmla.f32 q13, q10, d5[0] \n\t" - - // in_ptr2 multiply - "vext.32 q8, q5, q6, #1 \n\t" - "pld [%[in_ptr3], #128] \n\t" - "vmla.f32 q12, q5, d1[1] \n\t" - "vmla.f32 q13, q5, d5[1] \n\t" - - "vext.32 q9, q5, q6, #2 \n\t" - "vld1.f32 {d13-d15}, [%[in_ptr3]] \n\t" - "add %[in_ptr3], #16 \n\t" - "vmla.f32 q12, q8, d2[0] \n\t" - "vmla.f32 q13, q8, d6[0] \n\t" - - "vmla.f32 q12, q9, d2[1] \n\t" - "vmla.f32 q13, q9, d6[1] \n\t" - - // in_ptr3 multiply - "vext.32 q10, q6, q7, #2 \n\t" - "vmla.f32 q12, q7, d8[0] \n\t" - "vmla.f32 q13, q7, d8[1] \n\t" - - "vext.32 q11, q6, q7, #3 \n\t" - "vmla.f32 q12, q10, d3[0] \n\t" - "vmla.f32 q13, q10, d7[0] \n\t" - - "vmla.f32 q12, q11, d3[1] \n\t" - "vmla.f32 q13, q11, d7[1] \n\t" - - // store out_ptr - "subs %[loop], #1 \n\t" - "vst1.f32 {d24, d25}, [%[out_ptr1]]! \n\t" - "vst1.f32 {d26, d27}, [%[out_ptr1_c2]]! 
\n\t" - - // cycle - "bne 0b \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [out_ptr1_c2] "+r"(out_ptr1_c2), - [out_ptr2_c2] "+r"(out_ptr2_c2), [in_ptr1] "+r"(in_ptr1), - [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), - [in_ptr4] "+r"(in_ptr4) - : [f1] "r"(f1), [f1_c2] "r"(f1_c2) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q9", "q10", "q11", "q12", "q13"); - } - } - -#endif // __aarch64__ -#endif // __ARM_NEON - - // remain output_width - for (; o_w < output_w; ++o_w) { - float sum1 = 0; - float sum1_c2 = 0; -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - - float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); - sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); - sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - 
sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; -#endif - if (!if_nopadding && - (o_w < padding_w || o_w > output_w - padding_w - 2)) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - pad_filter0_c2--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - } else { - in_ptr1++; - in_ptr2++; - in_ptr3++; - in_ptr4++; - } - *out_ptr1 += sum1; - *out_ptr1_c2 += sum1_c2; - - out_ptr1++; - out_ptr1_c2++; - } - out_ptr1 += output_w; - out_ptr1_c2 += output_w; - } - filter_data_ch += filter_ch_size; - filter_data_ch_c2 += filter_ch_size; - input_data_ch += in_ch_size; - } - } - - int out_ch_remain_start = output_ch - output_ch % 2; - // remain output_channel - for (int o_c = out_ch_remain_start; o_c < output_ch; ++o_c) { - bool issamefilter; - const float *in_ptr1, *in_ptr2, *in_ptr3, *in_ptr4; - const float *f1; - const float *pad_filter0, *pad_filter1, *pad_filter2, *pad_filter3; - float pad_filter_arr[pad_filter_ch_size]; - float *output_data_ch; - const float *input_data_ch; - const float *filter_data_ch; - - input_data_ch = input_data; - output_data_ch = output_data + o_c * out_ch_size; - filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; - - for (int i_c = 0; i_c < input_ch; ++i_c) { - f1 = filter_data_ch; - if (!if_nopadding) { - memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); - for (int i = 0; i < 9; ++i) { - int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + - 
padding_w * (2 * padding_h + 1); - pad_filter_arr[j] = filter_data_ch[i]; - } - pad_filter1 = pad_filter_arr; - pad_filter1 += pad_filter_start; - pad_filter0 = pad_filter1 - pad_filter_w; - pad_filter2 = pad_filter1 + pad_filter_w; - pad_filter3 = pad_filter2 + pad_filter_w; - - } else { - pad_filter1 = filter_data_ch; - pad_filter2 = pad_filter1 + 3; - pad_filter3 = pad_filter2 + 3; - } - float *out_ptr1, *out_ptr2; - out_ptr1 = output_data_ch; - out_ptr2 = out_ptr1 + output_w; - - in_ptr1 = input_data_ch; - in_ptr2 = in_ptr1 + input_w; - in_ptr3 = in_ptr2 + input_w; - in_ptr4 = in_ptr3 + input_w; - - int o_h = 0; - for (; o_h < output_h - 1; o_h = o_h + 2) { - if (!if_nopadding && - (o_h < padding_h || o_h > output_h - padding_h - 2)) { - issamefilter = false; - } else { - issamefilter = true; - } - int o_w = 0; - // pad left - for (; o_w < padding_w; ++o_w) { - float sum1 = 0; - float sum2 = 0; - - if (issamefilter) { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - - float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); - _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); 
-#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; - sum2 += in_ptr4[0] * pad_filter3[0]; - sum2 += in_ptr4[1] * pad_filter3[1]; - sum2 += in_ptr4[2] * pad_filter3[2]; -#endif - } else { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); - - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * 
pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr1[0] * pad_filter0[0]; - sum2 += in_ptr1[1] * pad_filter0[1]; - sum2 += in_ptr1[2] * pad_filter0[2]; - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; -#endif - } - if (!if_nopadding && - (o_w < padding_w || o_w > output_w - padding_w - 2)) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - } else { - in_ptr1++; - in_ptr2++; - in_ptr3++; - in_ptr4++; - } - *out_ptr1 += sum1; - *out_ptr2 += sum2; - - out_ptr1++; - out_ptr2++; - } - // valid -#if __ARM_NEON -#if __aarch64__ - if (issamefilter) { - int loop = (output_w - 2 * padding_w) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "prfm pldl1keep, [%[f1], #256] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[f1]] \n\t" - "add %[f1], %[f1], #32 \n\t" - - "ld1 {v4.s}[0], [%[f1]] \n\t" - "sub %[f1],%[f1], #32 \n\t" - - "0: \n\t" - // load out_ptr - "prfm pldl1keep, [%[out_ptr1], #128] \n\t" - "prfm pldl1keep, [%[out_ptr2], #128] \n\t" - - "ld1 {v12.4s}, [%[out_ptr1]] \n\t" - "ld1 {v14.4s}, [%[out_ptr2]] \n\t" - - // in_ptr1 + in_ptr4 multiply - "prfm pldl1keep, [%[in_ptr1], #192] \n\t" - "prfm pldl1keep, [%[in_ptr4], #192] \n\t" - - "ld1 {v5.4s, v6.4s}, [%[in_ptr1]] \n\t" - "add %[in_ptr1],%[in_ptr1], #16 \n\t" - - "ld1 {v6.d}[1], [%[in_ptr4]] \n\t" - "add %[in_ptr4],%[in_ptr4], #8 \n\t" - "ld1 {v7.4s}, [%[in_ptr4]] \n\t" - "add %[in_ptr4],%[in_ptr4], #8 \n\t" - - "ext v8.16b, v5.16b, v6.16b, #4 \n\t" - "fmla v12.4s, v5.4s, v0.s[0] \n\t" - - "ext v9.16b, v6.16b, v7.16b, #8 \n\t" - "fmla v14.4s, v7.4s, v4.s[0] \n\t" - - "ext v10.16b, v5.16b, v6.16b, #8 \n\t" - "fmla v12.4s, v8.4s, v0.s[1] \n\t" - - "ext v11.16b, v6.16b, v7.16b, #12 
\n\t" - "fmla v14.4s, v9.4s, v1.s[2] \n\t" - - "ld1 {v5.4s, v6.4s}, [%[in_ptr2]] \n\t" - "add %[in_ptr2],%[in_ptr2], #16 \n\t" - - "fmla v12.4s, v10.4s, v0.s[2] \n\t" - "fmla v14.4s, v11.4s, v1.s[3] \n\t" - - // in_ptr2 multiply - "ext v8.16b, v5.16b, v6.16b, #4 \n\t" - "fmla v12.4s, v5.4s, v0.s[3] \n\t" - "fmla v14.4s, v5.4s, v0.s[0] \n\t" - - "ext v9.16b, v5.16b, v6.16b, #8 \n\t" - "fmla v12.4s, v8.4s, v1.s[0] \n\t" - "fmla v14.4s, v8.4s, v0.s[1] \n\t" - - "ld1 {v6.d}[1], [%[in_ptr3]] \n\t" - "add %[in_ptr3],%[in_ptr3], #8 \n\t" - "ld1 {v7.4s}, [%[in_ptr3]] \n\t" - - "add %[in_ptr3],%[in_ptr3], #8 \n\t" - "fmla v12.4s, v9.4s, v1.s[1] \n\t" - "fmla v14.4s, v9.4s, v0.s[2] \n\t" - - // in_ptr3 multiply - "ext v10.16b, v6.16b, v7.16b, #8 \n\t" - "fmla v12.4s, v7.4s, v4.s[0] \n\t" - "fmla v14.4s, v7.4s, v1.s[1] \n\t" - - "ext v11.16b, v6.16b, v7.16b, #12 \n\t" - "fmla v12.4s, v10.4s, v1.s[2] \n\t" - "fmla v14.4s, v10.4s, v0.s[3] \n\t" - - "fmla v12.4s, v11.4s, v1.s[3] \n\t" - "fmla v14.4s, v11.4s, v1.s[0] \n\t" - - // store out_ptr - "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" - "st1 {v14.4s}, [%[out_ptr2]], #16 \n\t" - - // cycle - "subs %[loop],%[loop], #1 \n\t" - "bne 0b \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [in_ptr1] "+r"(in_ptr1), - [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), - [in_ptr4] "+r"(in_ptr4) - : [f1] "r"(f1) - : "cc", "memory", "v0", "v1", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v14"); - } - } - if (!if_nopadding && o_w == output_w - padding_w) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - in_ptr1--; - in_ptr2--; - in_ptr3--; - in_ptr4--; - } -#else - if (issamefilter) { - int loop = (output_w - 2 * padding_w) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "pld [%[f1], #256] \n\t" - "vld1.f32 {d0-d3}, [%[f1]] \n\t" - "add %[f1], #32 \n\t" - - "vld1.f32 {d8[0]}, [%[f1]] \n\t" - "sub %[f1], #32 \n\t" - - "0: \n\t" - // load out_ptr - "pld 
[%[out_ptr1], #128] \n\t" - "pld [%[out_ptr2], #128] \n\t" - - "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" - "vld1.f32 {d28, d29}, [%[out_ptr2]] \n\t" - - // in_ptr1 + in_ptr4 multiply - "pld [%[in_ptr1], #192] \n\t" - "pld [%[in_ptr4], #192] \n\t" - - "vld1.f32 {d10-d12}, [%[in_ptr1]] \n\t" - "add %[in_ptr1], #16 \n\t" - - "vld1.f32 {d13-d15}, [%[in_ptr4]] \n\t" - "add %[in_ptr4], #16 \n\t" - - "vext.32 q8, q5, q6, #1 \n\t" - "vmla.f32 q12, q5, d0[0] \n\t" - - "vext.32 q9, q6, q7, #2 \n\t" - "vmla.f32 q14, q7, d8[0] \n\t" - - "vext.32 q10, q5, q6, #2 \n\t" - "vmla.f32 q12, q8, d0[1] \n\t" - - "vext.32 q11, q6, q7, #3 \n\t" - "vmla.f32 q14, q9, d3[0] \n\t" - - "vld1.f32 {d10-d12}, [%[in_ptr2]] \n\t" - "add %[in_ptr2], #16 \n\t" - - "vmla.f32 q12, q10, d1[0] \n\t" - "vmla.f32 q14, q11, d3[1] \n\t" - - // in_ptr2 multiply - "vext.32 q8, q5, q6, #1 \n\t" - "vmla.f32 q12, q5, d1[1] \n\t" - "vmla.f32 q14, q5, d0[0] \n\t" - - "vext.32 q9, q5, q6, #2 \n\t" - "vmla.f32 q12, q8, d2[0] \n\t" - "vmla.f32 q14, q8, d0[1] \n\t" - - "vld1.f32 {d13-d15}, [%[in_ptr3]] \n\t" - "add %[in_ptr3], #16 \n\t" - - "vmla.f32 q12, q9, d2[1] \n\t" - "vmla.f32 q14, q9, d1[0] \n\t" - - // in_ptr3 multiply - "vext.32 q10, q6, q7, #2 \n\t" - "vmla.f32 q12, q7, d8[0] \n\t" - "vmla.f32 q14, q7, d2[1] \n\t" - - "vext.32 q11, q6, q7, #3 \n\t" - "vmla.f32 q12, q10, d3[0] \n\t" - "vmla.f32 q14, q10, d1[1] \n\t" - - "vmla.f32 q12, q11, d3[1] \n\t" - "vmla.f32 q14, q11, d2[0] \n\t" - - // store out_ptr - "subs %[loop], #1 \n\t" - "vst1.f32 {d24, d25}, [%[out_ptr1]]! \n\t" - "vst1.f32 {d28, d29}, [%[out_ptr2]]! 
\n\t" - - // cycle - "bne 0b \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [in_ptr1] "+r"(in_ptr1), - [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), - [in_ptr4] "+r"(in_ptr4) - : [f1] "r"(f1) - : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q14"); - } - } - if (!if_nopadding && o_w == output_w - padding_w) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - in_ptr1--; - in_ptr2--; - in_ptr3--; - in_ptr4--; - } -#endif // __aarch64__ -#endif // __ARM_NEON - - // remain output_width - for (; o_w < output_w; ++o_w) { - float sum1 = 0; - float sum2 = 0; - - if (issamefilter) { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - float32x4_t _sum2 = vmulq_f32(_in_ptr2, _pad_filter1); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - - float32x4_t _in_ptr4 = vld1q_f32(in_ptr4); - _sum2 = vmlaq_f32(_sum2, _in_ptr4, _pad_filter3); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 
+= in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; - sum2 += in_ptr4[0] * pad_filter3[0]; - sum2 += in_ptr4[1] * pad_filter3[1]; - sum2 += in_ptr4[2] * pad_filter3[2]; -#endif - } else { -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter0 = vld1q_f32(pad_filter0); - - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum2 = vmulq_f32(_in_ptr1, _pad_filter0); - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum2 = vmlaq_f32(_sum2, _in_ptr2, _pad_filter1); - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum2 = vmlaq_f32(_sum2, _in_ptr3, _pad_filter2); - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum2 = vsetq_lane_f32(sum2, _sum2, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ssss1_ssss2 = vpadd_f32(_ss1, _ss2); - - sum1 += vget_lane_f32(_ssss1_ssss2, 0); - sum2 += vget_lane_f32(_ssss1_ssss2, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum2 += in_ptr1[0] * pad_filter0[0]; - sum2 += in_ptr1[1] * pad_filter0[1]; - sum2 += 
in_ptr1[2] * pad_filter0[2]; - sum2 += in_ptr2[0] * pad_filter1[0]; - sum2 += in_ptr2[1] * pad_filter1[1]; - sum2 += in_ptr2[2] * pad_filter1[2]; - sum2 += in_ptr3[0] * pad_filter2[0]; - sum2 += in_ptr3[1] * pad_filter2[1]; - sum2 += in_ptr3[2] * pad_filter2[2]; -#endif - } - if (!if_nopadding && - (o_w < padding_w || o_w > output_w - padding_w - 2)) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - } else { - in_ptr1++; - in_ptr2++; - in_ptr3++; - in_ptr4++; - } - *out_ptr1 += sum1; - *out_ptr2 += sum2; - - out_ptr1++; - out_ptr2++; - } - if (if_nopadding) { - in_ptr1 += 2 + input_w; - in_ptr2 += 2 + input_w; - in_ptr3 += 2 + input_w; - in_ptr4 += 2 + input_w; - } else if (o_h == padding_h - 1 || o_h == output_h - padding_h - 2) { - in_ptr1 += 3; - in_ptr2 += 3; - in_ptr3 += 3; - in_ptr4 += 3; - - pad_filter0 -= 2; - pad_filter1 -= 2; - pad_filter2 -= 2; - pad_filter3 -= 2; - - } else if (issamefilter) { - in_ptr1 += 3 + input_w; - in_ptr2 += 3 + input_w; - in_ptr3 += 3 + input_w; - in_ptr4 += 3 + input_w; - - pad_filter0 += 2 * padding_w + 1; - pad_filter1 += 2 * padding_w + 1; - pad_filter2 += 2 * padding_w + 1; - pad_filter3 += 2 * padding_w + 1; - - } else { - pad_filter0 -= 3 + 2 * padding_w + 2; - pad_filter1 -= 3 + 2 * padding_w + 2; - pad_filter2 -= 3 + 2 * padding_w + 2; - pad_filter3 -= 3 + 2 * padding_w + 2; - - in_ptr1 -= input_w - 3; - in_ptr2 -= input_w - 3; - in_ptr3 -= input_w - 3; - in_ptr4 -= input_w - 3; - } - out_ptr1 += output_w; - out_ptr2 += output_w; - } - - // remain output_height - for (; o_h < output_h; ++o_h) { - for (int o_w = 0; o_w < output_w; ++o_w) { - float sum1 = 0; -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - - float32x4_t _in_ptr3 
= vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1); - sum1 += vget_lane_f32(_ssss1_ssss1, 0); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; -#endif - if (!if_nopadding && - (o_w < padding_w || o_w > output_w - padding_w - 2)) { - pad_filter0--; - pad_filter1--; - pad_filter2--; - pad_filter3--; - - } else { - in_ptr1++; - in_ptr2++; - in_ptr3++; - in_ptr4++; - } - *out_ptr1 += sum1; - out_ptr1++; - } - out_ptr1 += output_w; - } - filter_data_ch += filter_ch_size; - input_data_ch += in_ch_size; - } - } - input_data += in_batch_size; - output_data += out_batch_size; - } -} - -template <> -void SlidingwindowConv3x3s2(const framework::Tensor *input, - const framework::Tensor *filter, - const std::vector &paddings, - framework::Tensor *output) { - const int batch = input->dims()[0]; - const int input_ch = input->dims()[1]; - const int input_h = input->dims()[2]; - const int input_w = input->dims()[3]; - const int output_ch = output->dims()[1]; - const int output_h = output->dims()[2]; - const int output_w = output->dims()[3]; - const int padding_h = paddings[0]; - const int padding_w = paddings[1]; - - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - const float *filter_data = filter->data(); - - const int in_ch_size = input_h * input_w; - const int in_batch_size = input_ch * in_ch_size; - const int out_ch_size = output_h * output_w; - const int out_batch_size = output_ch * 
out_ch_size; - const int out_size = batch * out_batch_size; - const int filter_ch_size = 9; - const int pad_filter_ch_size = (2 * padding_h + 3) * (2 * padding_w + 3); - const int pad_filter_start = - 2 * padding_h * (2 * padding_w + 3) + 2 * padding_w; - const int pad_filter_w = 3 + padding_w * 2; - - bool if_nopadding = false; - const bool if_exact_in_w = (input_w + 2 * padding_w - 3) % 2 == 0; - const bool if_exact_in_h = (input_h + 2 * padding_h - 3) % 2 == 0; - const bool if_odd_pad_w = padding_w % 2 == 1; - const bool if_odd_pad_h = padding_h % 2 == 1; - - int valid_w_start = padding_w >> 1; - int valid_h_start = padding_h >> 1; - int valid_w_end = output_w - valid_w_start - 2; - int valid_h_end = output_h - valid_h_start - 2; - const int remain_stride_w = input_w + 2 * padding_w - 2 * output_w; -#if __ARM_NEON - float *out_ptr = output_data; - int remain = out_size & 0x3; - float32x4_t _zero = vdupq_n_f32(0.0); - - for (int i = 0; i < out_size; i += 4) { - vst1q_f32(out_ptr, _zero); - out_ptr += 4; - } - switch (remain) { - case 1: - vst1q_lane_f32(out_ptr, _zero, 0); - break; - case 2: - vst1_f32(out_ptr, vget_low_f32(_zero)); - break; - case 3: - vst1_f32(out_ptr, vget_low_f32(_zero)); - vst1q_lane_f32(out_ptr + 2, _zero, 0); - break; - } -#else -#pragma omp parallel for - for (int i = 0; i < out_size; ++i) { - output_data[i] = 0; - } -#endif - - if (padding_h == 0 && padding_w == 0) { - if_nopadding = true; - valid_w_start = -1; - valid_h_start = -1; - valid_w_end = output_w; - valid_h_end = output_h; - } - - for (int b = 0; b < batch; ++b) { -#pragma omp parallel for - for (int o_c = 0; o_c < output_ch - 7; o_c += 8) { - const float *f1; - const float *in_ptr1, *in_ptr2, *in_ptr3; - const float *pad_filter1, *pad_filter2, *pad_filter3; - const float *pad_filter1_c2, *pad_filter2_c2, *pad_filter3_c2; - const float *pad_filter1_c3, *pad_filter2_c3, *pad_filter3_c3; - const float *pad_filter1_c4, *pad_filter2_c4, *pad_filter3_c4; - const float 
*pad_filter1_c5, *pad_filter2_c5, *pad_filter3_c5; - const float *pad_filter1_c6, *pad_filter2_c6, *pad_filter3_c6; - const float *pad_filter1_c7, *pad_filter2_c7, *pad_filter3_c7; - const float *pad_filter1_c8, *pad_filter2_c8, *pad_filter3_c8; - - float reform_filter_arr[72]; - float pad_filter_arr[pad_filter_ch_size]; - float pad_filter_arr_c2[pad_filter_ch_size]; - float pad_filter_arr_c3[pad_filter_ch_size]; - float pad_filter_arr_c4[pad_filter_ch_size]; - float pad_filter_arr_c5[pad_filter_ch_size]; - float pad_filter_arr_c6[pad_filter_ch_size]; - float pad_filter_arr_c7[pad_filter_ch_size]; - float pad_filter_arr_c8[pad_filter_ch_size]; - - float *output_data_ch; - float *output_data_ch_2; - float *output_data_ch_3; - float *output_data_ch_4; - float *output_data_ch_5; - float *output_data_ch_6; - float *output_data_ch_7; - float *output_data_ch_8; - - const float *input_data_ch; - const float *filter_data_ch; - const float *filter_data_ch_c2; - const float *filter_data_ch_c3; - const float *filter_data_ch_c4; - const float *filter_data_ch_c5; - const float *filter_data_ch_c6; - const float *filter_data_ch_c7; - const float *filter_data_ch_c8; - - filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; - filter_data_ch_c2 = filter_data + (o_c + 1) * filter_ch_size * input_ch; - filter_data_ch_c3 = filter_data + (o_c + 2) * filter_ch_size * input_ch; - filter_data_ch_c4 = filter_data + (o_c + 3) * filter_ch_size * input_ch; - filter_data_ch_c5 = filter_data + (o_c + 4) * filter_ch_size * input_ch; - filter_data_ch_c6 = filter_data + (o_c + 5) * filter_ch_size * input_ch; - filter_data_ch_c7 = filter_data + (o_c + 6) * filter_ch_size * input_ch; - filter_data_ch_c8 = filter_data + (o_c + 7) * filter_ch_size * input_ch; - - input_data_ch = input_data; - output_data_ch = output_data + o_c * out_ch_size; - output_data_ch_2 = output_data + (o_c + 1) * out_ch_size; - output_data_ch_3 = output_data + (o_c + 2) * out_ch_size; - output_data_ch_4 = output_data 
+ (o_c + 3) * out_ch_size; - output_data_ch_5 = output_data + (o_c + 4) * out_ch_size; - output_data_ch_6 = output_data + (o_c + 5) * out_ch_size; - output_data_ch_7 = output_data + (o_c + 6) * out_ch_size; - output_data_ch_8 = output_data + (o_c + 7) * out_ch_size; - - for (int i_c = 0; i_c < input_ch; ++i_c) { - int k = 0; - for (int i = 0; i < 9; ++i) { - for (int j = 0; j < 8; ++j) { - reform_filter_arr[k++] = filter_data_ch[i + input_ch * 9 * j]; - } - } - - f1 = reform_filter_arr; - - if (!if_nopadding) { - memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); - memset(pad_filter_arr_c2, 0.f, sizeof(pad_filter_arr_c2)); - memset(pad_filter_arr_c3, 0.f, sizeof(pad_filter_arr_c3)); - memset(pad_filter_arr_c4, 0.f, sizeof(pad_filter_arr_c4)); - memset(pad_filter_arr_c5, 0.f, sizeof(pad_filter_arr_c5)); - memset(pad_filter_arr_c6, 0.f, sizeof(pad_filter_arr_c6)); - memset(pad_filter_arr_c7, 0.f, sizeof(pad_filter_arr_c7)); - memset(pad_filter_arr_c8, 0.f, sizeof(pad_filter_arr_c8)); - - for (int i = 0; i < 9; ++i) { - int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + - padding_w * (2 * padding_h + 1); - pad_filter_arr[j] = filter_data_ch[i]; - pad_filter_arr_c2[j] = filter_data_ch_c2[i]; - pad_filter_arr_c3[j] = filter_data_ch_c3[i]; - pad_filter_arr_c4[j] = filter_data_ch_c4[i]; - pad_filter_arr_c5[j] = filter_data_ch_c5[i]; - pad_filter_arr_c6[j] = filter_data_ch_c6[i]; - pad_filter_arr_c7[j] = filter_data_ch_c7[i]; - pad_filter_arr_c8[j] = filter_data_ch_c8[i]; - } - - pad_filter1 = pad_filter_arr; - pad_filter1 += pad_filter_start; - pad_filter2 = pad_filter1 + pad_filter_w; - pad_filter3 = pad_filter2 + pad_filter_w; - - pad_filter1_c2 = pad_filter_arr_c2; - pad_filter1_c2 += pad_filter_start; - pad_filter2_c2 = pad_filter1_c2 + pad_filter_w; - pad_filter3_c2 = pad_filter2_c2 + pad_filter_w; - - pad_filter1_c3 = pad_filter_arr_c3; - pad_filter1_c3 += pad_filter_start; - pad_filter2_c3 = pad_filter1_c3 + pad_filter_w; - pad_filter3_c3 = 
pad_filter2_c3 + pad_filter_w; - - pad_filter1_c4 = pad_filter_arr_c4; - pad_filter1_c4 += pad_filter_start; - pad_filter2_c4 = pad_filter1_c4 + pad_filter_w; - pad_filter3_c4 = pad_filter2_c4 + pad_filter_w; - - pad_filter1_c5 = pad_filter_arr_c5; - pad_filter1_c5 += pad_filter_start; - pad_filter2_c5 = pad_filter1_c5 + pad_filter_w; - pad_filter3_c5 = pad_filter2_c5 + pad_filter_w; - - pad_filter1_c6 = pad_filter_arr_c6; - pad_filter1_c6 += pad_filter_start; - pad_filter2_c6 = pad_filter1_c6 + pad_filter_w; - pad_filter3_c6 = pad_filter2_c6 + pad_filter_w; - - pad_filter1_c7 = pad_filter_arr_c7; - pad_filter1_c7 += pad_filter_start; - pad_filter2_c7 = pad_filter1_c7 + pad_filter_w; - pad_filter3_c7 = pad_filter2_c7 + pad_filter_w; - - pad_filter1_c8 = pad_filter_arr_c8; - pad_filter1_c8 += pad_filter_start; - pad_filter2_c8 = pad_filter1_c8 + pad_filter_w; - pad_filter3_c8 = pad_filter2_c8 + pad_filter_w; - } else { - pad_filter1 = filter_data_ch; - pad_filter2 = pad_filter1 + 3; - pad_filter3 = pad_filter2 + 3; - - pad_filter1_c2 = filter_data_ch_c2; - pad_filter2_c2 = pad_filter1_c2 + 3; - pad_filter3_c2 = pad_filter2_c2 + 3; - - pad_filter1_c3 = filter_data_ch_c3; - pad_filter2_c3 = pad_filter1_c3 + 3; - pad_filter3_c3 = pad_filter2_c3 + 3; - - pad_filter1_c4 = filter_data_ch_c4; - pad_filter2_c4 = pad_filter1_c4 + 3; - pad_filter3_c4 = pad_filter2_c4 + 3; - - pad_filter1_c5 = filter_data_ch_c5; - pad_filter2_c5 = pad_filter1_c5 + 3; - pad_filter3_c5 = pad_filter2_c5 + 3; - - pad_filter1_c6 = filter_data_ch_c6; - pad_filter2_c6 = pad_filter1_c6 + 3; - pad_filter3_c6 = pad_filter2_c6 + 3; - - pad_filter1_c7 = filter_data_ch_c7; - pad_filter2_c7 = pad_filter1_c7 + 3; - pad_filter3_c7 = pad_filter2_c7 + 3; - - pad_filter1_c8 = filter_data_ch_c8; - pad_filter2_c8 = pad_filter1_c8 + 3; - pad_filter3_c8 = pad_filter2_c8 + 3; - } - float *out_ptr1; - float *out_ptr1_c2; - float *out_ptr1_c3; - float *out_ptr1_c4; - float *out_ptr1_c5; - float *out_ptr1_c6; - float 
*out_ptr1_c7; - float *out_ptr1_c8; - - out_ptr1 = output_data_ch; - out_ptr1_c2 = output_data_ch_2; - out_ptr1_c3 = output_data_ch_3; - out_ptr1_c4 = output_data_ch_4; - out_ptr1_c5 = output_data_ch_5; - out_ptr1_c6 = output_data_ch_6; - out_ptr1_c7 = output_data_ch_7; - out_ptr1_c8 = output_data_ch_8; - - in_ptr1 = input_data_ch; - in_ptr2 = in_ptr1 + input_w; - in_ptr3 = in_ptr2 + input_w; - - int o_h = 0; - - for (; o_h < output_h; ++o_h) { - int o_w = 0; - - // pad left - for (; o_w <= valid_w_start; ++o_w) { - float sum1 = 0; - float sum1_c2 = 0; - float sum1_c3 = 0; - float sum1_c4 = 0; - float sum1_c5 = 0; - float sum1_c6 = 0; - float sum1_c7 = 0; - float sum1_c8 = 0; -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _pad_filter1_c3 = vld1q_f32(pad_filter1_c3); - float32x4_t _pad_filter1_c4 = vld1q_f32(pad_filter1_c4); - float32x4_t _pad_filter1_c5 = vld1q_f32(pad_filter1_c5); - float32x4_t _pad_filter1_c6 = vld1q_f32(pad_filter1_c6); - float32x4_t _pad_filter1_c7 = vld1q_f32(pad_filter1_c7); - float32x4_t _pad_filter1_c8 = vld1q_f32(pad_filter1_c8); - - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - float32x4_t _sum1_c3 = vmulq_f32(_in_ptr1, _pad_filter1_c3); - float32x4_t _sum1_c4 = vmulq_f32(_in_ptr1, _pad_filter1_c4); - float32x4_t _sum1_c5 = vmulq_f32(_in_ptr1, _pad_filter1_c5); - float32x4_t _sum1_c6 = vmulq_f32(_in_ptr1, _pad_filter1_c6); - float32x4_t _sum1_c7 = vmulq_f32(_in_ptr1, _pad_filter1_c7); - float32x4_t _sum1_c8 = vmulq_f32(_in_ptr1, _pad_filter1_c8); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - float32x4_t _pad_filter2_c3 = vld1q_f32(pad_filter2_c3); - float32x4_t _pad_filter2_c4 = vld1q_f32(pad_filter2_c4); - 
float32x4_t _pad_filter2_c5 = vld1q_f32(pad_filter2_c5); - float32x4_t _pad_filter2_c6 = vld1q_f32(pad_filter2_c6); - float32x4_t _pad_filter2_c7 = vld1q_f32(pad_filter2_c7); - float32x4_t _pad_filter2_c8 = vld1q_f32(pad_filter2_c8); - - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr2, _pad_filter2_c3); - _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr2, _pad_filter2_c4); - _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr2, _pad_filter2_c5); - _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr2, _pad_filter2_c6); - _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr2, _pad_filter2_c7); - _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr2, _pad_filter2_c8); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - float32x4_t _pad_filter3_c3 = vld1q_f32(pad_filter3_c3); - float32x4_t _pad_filter3_c4 = vld1q_f32(pad_filter3_c4); - float32x4_t _pad_filter3_c5 = vld1q_f32(pad_filter3_c5); - float32x4_t _pad_filter3_c6 = vld1q_f32(pad_filter3_c6); - float32x4_t _pad_filter3_c7 = vld1q_f32(pad_filter3_c7); - float32x4_t _pad_filter3_c8 = vld1q_f32(pad_filter3_c8); - - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); - _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr3, _pad_filter3_c3); - _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr3, _pad_filter3_c4); - _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr3, _pad_filter3_c5); - _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr3, _pad_filter3_c6); - _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr3, _pad_filter3_c7); - _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr3, _pad_filter3_c8); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - _sum1_c3 = vsetq_lane_f32(sum1_c3, _sum1_c3, 3); - _sum1_c4 = vsetq_lane_f32(sum1_c4, _sum1_c4, 3); - _sum1_c5 = vsetq_lane_f32(sum1_c5, _sum1_c5, 3); - _sum1_c6 = vsetq_lane_f32(sum1_c6, 
_sum1_c6, 3); - _sum1_c7 = vsetq_lane_f32(sum1_c7, _sum1_c7, 3); - _sum1_c8 = vsetq_lane_f32(sum1_c8, _sum1_c8, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - float32x2_t _ss1_3 = - vadd_f32(vget_low_f32(_sum1_c3), vget_high_f32(_sum1_c3)); - float32x2_t _ss1_4 = - vadd_f32(vget_low_f32(_sum1_c4), vget_high_f32(_sum1_c4)); - float32x2_t _ss1_5 = - vadd_f32(vget_low_f32(_sum1_c5), vget_high_f32(_sum1_c5)); - float32x2_t _ss1_6 = - vadd_f32(vget_low_f32(_sum1_c6), vget_high_f32(_sum1_c6)); - float32x2_t _ss1_7 = - vadd_f32(vget_low_f32(_sum1_c7), vget_high_f32(_sum1_c7)); - float32x2_t _ss1_8 = - vadd_f32(vget_low_f32(_sum1_c8), vget_high_f32(_sum1_c8)); - - float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); - float32x2_t _ssss1_3_ssss1_4 = vpadd_f32(_ss1_3, _ss1_4); - float32x2_t _ssss1_5_ssss1_6 = vpadd_f32(_ss1_5, _ss1_6); - float32x2_t _ssss1_7_ssss1_8 = vpadd_f32(_ss1_7, _ss1_8); - - sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); - sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); - sum1_c3 += vget_lane_f32(_ssss1_3_ssss1_4, 0); - sum1_c4 += vget_lane_f32(_ssss1_3_ssss1_4, 1); - sum1_c5 += vget_lane_f32(_ssss1_5_ssss1_6, 0); - sum1_c6 += vget_lane_f32(_ssss1_5_ssss1_6, 1); - sum1_c7 += vget_lane_f32(_ssss1_7_ssss1_8, 0); - sum1_c8 += vget_lane_f32(_ssss1_7_ssss1_8, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - 
sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; - - sum1_c3 += in_ptr1[0] * pad_filter1_c3[0]; - sum1_c3 += in_ptr1[1] * pad_filter1_c3[1]; - sum1_c3 += in_ptr1[2] * pad_filter1_c3[2]; - sum1_c3 += in_ptr2[0] * pad_filter2_c3[0]; - sum1_c3 += in_ptr2[1] * pad_filter2_c3[1]; - sum1_c3 += in_ptr2[2] * pad_filter2_c3[2]; - sum1_c3 += in_ptr3[0] * pad_filter3_c3[0]; - sum1_c3 += in_ptr3[1] * pad_filter3_c3[1]; - sum1_c3 += in_ptr3[2] * pad_filter3_c3[2]; - - sum1_c4 += in_ptr1[0] * pad_filter1_c4[0]; - sum1_c4 += in_ptr1[1] * pad_filter1_c4[1]; - sum1_c4 += in_ptr1[2] * pad_filter1_c4[2]; - sum1_c4 += in_ptr2[0] * pad_filter2_c4[0]; - sum1_c4 += in_ptr2[1] * pad_filter2_c4[1]; - sum1_c4 += in_ptr2[2] * pad_filter2_c4[2]; - sum1_c4 += in_ptr3[0] * pad_filter3_c4[0]; - sum1_c4 += in_ptr3[1] * pad_filter3_c4[1]; - sum1_c4 += in_ptr3[2] * pad_filter3_c4[2]; - - sum1_c5 += in_ptr1[0] * pad_filter1_c5[0]; - sum1_c5 += in_ptr1[1] * pad_filter1_c5[1]; - sum1_c5 += in_ptr1[2] * pad_filter1_c5[2]; - sum1_c5 += in_ptr2[0] * pad_filter2_c5[0]; - sum1_c5 += in_ptr2[1] * pad_filter2_c5[1]; - sum1_c5 += in_ptr2[2] * pad_filter2_c5[2]; - sum1_c5 += in_ptr3[0] * pad_filter3_c5[0]; - sum1_c5 += in_ptr3[1] * pad_filter3_c5[1]; - sum1_c5 += in_ptr3[2] * pad_filter3_c5[2]; - - sum1_c6 += in_ptr1[0] * pad_filter1_c6[0]; - sum1_c6 += in_ptr1[1] * pad_filter1_c6[1]; - sum1_c6 += in_ptr1[2] * pad_filter1_c6[2]; - sum1_c6 += in_ptr2[0] * pad_filter2_c6[0]; - sum1_c6 += in_ptr2[1] * pad_filter2_c6[1]; - sum1_c6 += in_ptr2[2] * pad_filter2_c6[2]; - sum1_c6 += in_ptr3[0] * pad_filter3_c6[0]; - sum1_c6 += in_ptr3[1] * pad_filter3_c6[1]; - sum1_c6 += in_ptr3[2] * pad_filter3_c6[2]; - - sum1_c7 += in_ptr1[0] * pad_filter1_c7[0]; - sum1_c7 += in_ptr1[1] * pad_filter1_c7[1]; - sum1_c7 += in_ptr1[2] * pad_filter1_c7[2]; - sum1_c7 += in_ptr2[0] * pad_filter2_c7[0]; - sum1_c7 
+= in_ptr2[1] * pad_filter2_c7[1]; - sum1_c7 += in_ptr2[2] * pad_filter2_c7[2]; - sum1_c7 += in_ptr3[0] * pad_filter3_c7[0]; - sum1_c7 += in_ptr3[1] * pad_filter3_c7[1]; - sum1_c7 += in_ptr3[2] * pad_filter3_c7[2]; - - sum1_c8 += in_ptr1[0] * pad_filter1_c8[0]; - sum1_c8 += in_ptr1[1] * pad_filter1_c8[1]; - sum1_c8 += in_ptr1[2] * pad_filter1_c8[2]; - sum1_c8 += in_ptr2[0] * pad_filter2_c8[0]; - sum1_c8 += in_ptr2[1] * pad_filter2_c8[1]; - sum1_c8 += in_ptr2[2] * pad_filter2_c8[2]; - sum1_c8 += in_ptr3[0] * pad_filter3_c8[0]; - sum1_c8 += in_ptr3[1] * pad_filter3_c8[1]; - sum1_c8 += in_ptr3[2] * pad_filter3_c8[2]; -#endif - if (if_nopadding) { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - - } else if (input_w > 3 && - (if_odd_pad_w && o_w == valid_w_start || - o_w == valid_w_end && if_odd_pad_w && if_exact_in_w || - o_w == valid_w_end + 1 && !if_odd_pad_w && - !if_exact_in_w)) { - pad_filter1--; - pad_filter2--; - pad_filter3--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - - pad_filter1_c3--; - pad_filter2_c3--; - pad_filter3_c3--; - pad_filter1_c4--; - pad_filter2_c4--; - pad_filter3_c4--; - - pad_filter1_c5--; - pad_filter2_c5--; - pad_filter3_c5--; - pad_filter1_c6--; - pad_filter2_c6--; - pad_filter3_c6--; - - pad_filter1_c7--; - pad_filter2_c7--; - pad_filter3_c7--; - pad_filter1_c8--; - pad_filter2_c8--; - pad_filter3_c8--; - - in_ptr1++; - in_ptr2++; - in_ptr3++; - - } else if (input_w <= 3 || o_w < valid_w_start || - o_w > valid_w_end) { - pad_filter1 -= 2; - pad_filter2 -= 2; - pad_filter3 -= 2; - pad_filter1_c2 -= 2; - pad_filter2_c2 -= 2; - pad_filter3_c2 -= 2; - - pad_filter1_c3 -= 2; - pad_filter2_c3 -= 2; - pad_filter3_c3 -= 2; - pad_filter1_c4 -= 2; - pad_filter2_c4 -= 2; - pad_filter3_c4 -= 2; - - pad_filter1_c5 -= 2; - pad_filter2_c5 -= 2; - pad_filter3_c5 -= 2; - pad_filter1_c6 -= 2; - pad_filter2_c6 -= 2; - pad_filter3_c6 -= 2; - - pad_filter1_c7 -= 2; - pad_filter2_c7 -= 2; - pad_filter3_c7 -= 2; - pad_filter1_c8 -= 2; - 
pad_filter2_c8 -= 2; - pad_filter3_c8 -= 2; - } else { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - } - *out_ptr1 += sum1; - *out_ptr1_c2 += sum1_c2; - *out_ptr1_c3 += sum1_c3; - *out_ptr1_c4 += sum1_c4; - *out_ptr1_c5 += sum1_c5; - *out_ptr1_c6 += sum1_c6; - *out_ptr1_c7 += sum1_c7; - *out_ptr1_c8 += sum1_c8; - - out_ptr1++; - out_ptr1_c2++; - out_ptr1_c3++; - out_ptr1_c4++; - out_ptr1_c5++; - out_ptr1_c6++; - out_ptr1_c7++; - out_ptr1_c8++; - } - // valid -#if __ARM_NEON -#if __aarch64__ - if (o_h > valid_h_start && o_h <= valid_h_end) { - int loop = (valid_w_end - valid_w_start - 1) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - - "prfm pldl1keep, [%[f1], #256] \n\t" - "prfm pldl1keep, [%[in_ptr1], #288] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" - "ld2 {v4.4s, v5.4s}, [%[in_ptr1]], #32 \n\t" - "ld2 {v6.4s, v7.4s}, [%[in_ptr1]] \n\t" - "0: \n\t" - // load out_ptr - "prfm pldl1keep, [%[out_ptr1], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c2], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c3], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c4], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c5], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c6], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c7], #128] \n\t" - "prfm pldl1keep, [%[out_ptr1_c8], #128] \n\t" - - "ld1 {v8.4s}, [%[out_ptr1]] \n\t" - "ld1 {v9.4s}, [%[out_ptr1_c2]] \n\t" - "ld1 {v10.4s}, [%[out_ptr1_c3]] \n\t" - "ld1 {v11.4s}, [%[out_ptr1_c4]] \n\t" - "ld1 {v12.4s}, [%[out_ptr1_c5]] \n\t" - "ld1 {v13.4s}, [%[out_ptr1_c6]] \n\t" - "ld1 {v14.4s}, [%[out_ptr1_c7]] \n\t" - "ld1 {v15.4s}, [%[out_ptr1_c8]] \n\t" - - // in_ptr1 multiply - "prfm pldl1keep, [%[f1], #256] \n\t" - "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" - "fmla v8.4s, v4.4s, v0.s[0] \n\t" - "fmla v9.4s, v4.4s, v0.s[1] \n\t" - "fmla v10.4s, v4.4s, v0.s[2] \n\t" - "fmla v11.4s, v4.4s, v0.s[3] \n\t" - - "fmla v12.4s, v4.4s, v1.s[0] \n\t" - "fmla v13.4s, v4.4s, v1.s[1] \n\t" - "fmla v14.4s, v4.4s, v1.s[2] \n\t" - "fmla v15.4s, v4.4s, v1.s[3] 
\n\t" - - "ext v7.16b, v4.16b, v6.16b, #4 \n\t" - "fmla v8.4s, v5.4s, v2.s[0] \n\t" - "fmla v9.4s, v5.4s, v2.s[1] \n\t" - "fmla v10.4s, v5.4s, v2.s[2] \n\t" - "fmla v11.4s, v5.4s, v2.s[3] \n\t" - - "prfm pldl1keep, [%[f1], #256] \n\t" - "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" - "fmla v12.4s, v5.4s, v3.s[0] \n\t" - "fmla v13.4s, v5.4s, v3.s[1] \n\t" - "fmla v14.4s, v5.4s, v3.s[2] \n\t" - "fmla v15.4s, v5.4s, v3.s[3] \n\t" - - "prfm pldl1keep, [%[in_ptr2], #288] \n\t" - "ld2 {v4.4s, v5.4s}, [%[in_ptr2]], #32 \n\t" - "fmla v8.4s, v7.4s, v0.s[0] \n\t" - "fmla v9.4s, v7.4s, v0.s[1] \n\t" - "fmla v10.4s, v7.4s, v0.s[2] \n\t" - "fmla v11.4s, v7.4s, v0.s[3] \n\t" - - "prfm pldl1keep, [%[f1], #256] \n\t" - "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" - - "fmla v12.4s, v7.4s, v1.s[0] \n\t" - "fmla v13.4s, v7.4s, v1.s[1] \n\t" - "fmla v14.4s, v7.4s, v1.s[2] \n\t" - "fmla v15.4s, v7.4s, v1.s[3] \n\t" - - // in_ptr2 multiply - "ld2 {v6.4s, v7.4s}, [%[in_ptr2]] \n\t" - "fmla v8.4s, v4.4s, v2.s[0] \n\t" - "fmla v9.4s, v4.4s, v2.s[1] \n\t" - "fmla v10.4s, v4.4s, v2.s[2] \n\t" - "fmla v11.4s, v4.4s, v2.s[3] \n\t" - - "prfm pldl1keep, [%[f1], #256] \n\t" - "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" - "fmla v12.4s, v4.4s, v3.s[0] \n\t" - "fmla v13.4s, v4.4s, v3.s[1] \n\t" - "fmla v14.4s, v4.4s, v3.s[2] \n\t" - "fmla v15.4s, v4.4s, v3.s[3] \n\t" - - "ext v7.16b, v4.16b, v6.16b, #4 \n\t" - "fmla v8.4s, v5.4s, v0.s[0] \n\t" - "fmla v9.4s, v5.4s, v0.s[1] \n\t" - "fmla v10.4s, v5.4s, v0.s[2] \n\t" - "fmla v11.4s, v5.4s, v0.s[3] \n\t" - - "prfm pldl1keep, [%[f1], #256] \n\t" - "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" - "fmla v12.4s, v5.4s, v1.s[0] \n\t" - "fmla v13.4s, v5.4s, v1.s[1] \n\t" - - "prfm pldl1keep, [%[f1], #256] \n\t" - "prfm pldl1keep, [%[in_ptr3], #288] \n\t" - "fmla v14.4s, v5.4s, v1.s[2] \n\t" - "fmla v15.4s, v5.4s, v1.s[3] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" - "ld2 {v4.4s, v5.4s}, [%[in_ptr3]], #32 \n\t" - "fmla v8.4s, v7.4s, v2.s[0] \n\t" - "fmla v9.4s, v7.4s, 
v2.s[1] \n\t" - "fmla v10.4s, v7.4s, v2.s[2] \n\t" - "fmla v11.4s, v7.4s, v2.s[3] \n\t" - - "fmla v12.4s, v7.4s, v3.s[0] \n\t" - "fmla v13.4s, v7.4s, v3.s[1] \n\t" - "fmla v14.4s, v7.4s, v3.s[2] \n\t" - "fmla v15.4s, v7.4s, v3.s[3] \n\t" - - // in_ptr3 multiply - "ld2 {v6.4s, v7.4s}, [%[in_ptr3]] \n\t" - "fmla v8.4s, v4.4s, v0.s[0] \n\t" - "fmla v9.4s, v4.4s, v0.s[1] \n\t" - "fmla v10.4s, v4.4s, v0.s[2] \n\t" - "fmla v11.4s, v4.4s, v0.s[3] \n\t" - - "prfm pldl1keep, [%[f1], #256] \n\t" - "ld1 {v2.4s, v3.4s}, [%[f1]], #32 \n\t" - "fmla v12.4s, v4.4s, v1.s[0] \n\t" - "fmla v13.4s, v4.4s, v1.s[1] \n\t" - "fmla v14.4s, v4.4s, v1.s[2] \n\t" - "fmla v15.4s, v4.4s, v1.s[3] \n\t" - - "ext v7.16b, v4.16b, v6.16b, #4 \n\t" - "fmla v8.4s, v5.4s, v2.s[0] \n\t" - "fmla v9.4s, v5.4s, v2.s[1] \n\t" - "fmla v10.4s, v5.4s, v2.s[2] \n\t" - "fmla v11.4s, v5.4s, v2.s[3] \n\t" - - "prfm pldl1keep, [%[f1], #256] \n\t" - "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" - "fmla v12.4s, v5.4s, v3.s[0] \n\t" - "fmla v13.4s, v5.4s, v3.s[1] \n\t" - "fmla v14.4s, v5.4s, v3.s[2] \n\t" - "fmla v15.4s, v5.4s, v3.s[3] \n\t" - - "sub %[f1], %[f1], #288 \n\t" - "fmla v8.4s, v7.4s, v0.s[0] \n\t" - "fmla v9.4s, v7.4s, v0.s[1] \n\t" - "fmla v10.4s, v7.4s, v0.s[2] \n\t" - "fmla v11.4s, v7.4s, v0.s[3] \n\t" - - "fmla v12.4s, v7.4s, v1.s[0] \n\t" - "fmla v13.4s, v7.4s, v1.s[1] \n\t" - "fmla v14.4s, v7.4s, v1.s[2] \n\t" - "fmla v15.4s, v7.4s, v1.s[3] \n\t" - - // store out_ptr - "prfm pldl1keep, [%[f1], #256] \n\t" - "prfm pldl1keep, [%[in_ptr1], #288] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[f1]], #32 \n\t" - - "ld2 {v4.4s, v5.4s}, [%[in_ptr1]], #32 \n\t" - "st1 {v8.4s}, [%[out_ptr1]], #16 \n\t" - "st1 {v9.4s}, [%[out_ptr1_c2]], #16 \n\t" - - "st1 {v10.4s}, [%[out_ptr1_c3]], #16 \n\t" - "st1 {v11.4s}, [%[out_ptr1_c4]], #16 \n\t" - - "st1 {v12.4s}, [%[out_ptr1_c5]], #16 \n\t" - "st1 {v13.4s}, [%[out_ptr1_c6]], #16 \n\t" - - "ld2 {v6.4s, v7.4s}, [%[in_ptr1]] \n\t" - "st1 {v14.4s}, [%[out_ptr1_c7]], #16 \n\t" - "subs 
%[loop], %[loop], #1 \n\t" - "st1 {v15.4s}, [%[out_ptr1_c8]], #16 \n\t" - - // cycle - "bne 0b \n\t" - "sub %[f1], %[in_ptr1], #32 \n\t" - "sub %[in_ptr1], %[in_ptr1], #32 \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr1_c2] "+r"(out_ptr1_c2), - [out_ptr1_c3] "+r"(out_ptr1_c3), - [out_ptr1_c4] "+r"(out_ptr1_c4), - [out_ptr1_c5] "+r"(out_ptr1_c5), - [out_ptr1_c6] "+r"(out_ptr1_c6), - [out_ptr1_c7] "+r"(out_ptr1_c7), - [out_ptr1_c8] "+r"(out_ptr1_c8), [in_ptr1] "+r"(in_ptr1), - [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3) - : [f1] "r"(f1) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); - } - } -#else - if (o_h > valid_h_start && o_h <= valid_h_end) { - int loop = (valid_w_end - valid_w_start - 1) >> 2; - o_w += loop * 4; - int in_stride = (input_w - 8) * 4; - - if (loop > 0) { - asm volatile( - - "pld [%[f1], #256] \n\t" - "pld [%[in_ptr1], #288] \n\t" - - "vld1.f32 {d0-d3}, [%[f1]]! \n\t" - "vld2.f32 {d8-d11}, [%[in_ptr1]]! \n\t" - "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" - "add %[in_ptr1], %[in_stride] \n\t" - - "0: \n\t" - // load out_ptr - "pld [%[out_ptr1], #128] \n\t" - "pld [%[out_ptr1_c2], #128] \n\t" - "pld [%[out_ptr1_c3], #128] \n\t" - "pld [%[out_ptr1_c4], #128] \n\t" - "pld [%[out_ptr1_c5], #128] \n\t" - "pld [%[out_ptr1_c6], #128] \n\t" - "pld [%[out_ptr1_c7], #128] \n\t" - "pld [%[out_ptr1_c8], #128] \n\t" - - "vld1.f32 {d16, d17}, [%[out_ptr1]] \n\t" - "vld1.f32 {d18, d19}, [%[out_ptr1_c2]] \n\t" - "vld1.f32 {d20, d21}, [%[out_ptr1_c3]] \n\t" - "vld1.f32 {d22, d23}, [%[out_ptr1_c4]] \n\t" - "vld1.f32 {d24, d25}, [%[out_ptr1_c5]] \n\t" - "vld1.f32 {d26, d27}, [%[out_ptr1_c6]] \n\t" - "vld1.f32 {d28, d29}, [%[out_ptr1_c7]] \n\t" - "vld1.f32 {d30, d31}, [%[out_ptr1_c8]] \n\t" - - // in_ptr1 multiply - "pld [%[f1], #256] \n\t" - "vld1.f32 {d4-d7}, [%[f1]]! 
\n\t" - "vmla.f32 q8, q4, d0[0] \n\t" - "vmla.f32 q9, q4, d0[1] \n\t" - - "vmla.f32 q10, q4, d1[0] \n\t" - "vmla.f32 q11, q4, d1[1] \n\t" - - "vmla.f32 q12, q4, d2[0] \n\t" - "vmla.f32 q13, q4, d2[1] \n\t" - - "pld [%[f1], #256] \n\t" - "vmla.f32 q14, q4, d3[0] \n\t" - "vmla.f32 q15, q4, d3[1] \n\t" - - "vld1.f32 {d0-d3}, [%[f1]]! \n\t" - "vmla.f32 q8, q5, d4[0] \n\t" - "vmla.f32 q9, q5, d4[1] \n\t" - - "vext.32 q7, q4, q6, #1 \n\t" - "vmla.f32 q10, q5, d5[0] \n\t" - "vmla.f32 q11, q5, d5[1] \n\t" - - "vmla.f32 q12, q5, d6[0] \n\t" - "vmla.f32 q13, q5, d6[1] \n\t" - - "pld [%[in_ptr1], #288] \n\t" - "vmla.f32 q14, q5, d7[0] \n\t" - "vmla.f32 q15, q5, d7[1] \n\t" - - "vld2.f32 {d8-d11}, [%[in_ptr1]]! \n\t" - "vmla.f32 q8, q7, d0[0] \n\t" - "vmla.f32 q9, q7, d0[1] \n\t" - - "pld [%[f1], #256] \n\t" - "vld1.f32 {d4-d7}, [%[f1]]! \n\t" - "vmla.f32 q10, q7, d1[0] \n\t" - "vmla.f32 q11, q7, d1[1] \n\t" - - "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" - "add %[in_ptr1], %[in_stride] \n\t" - "vmla.f32 q12, q7, d2[0] \n\t" - "vmla.f32 q13, q7, d2[1] \n\t" - - "pld [%[f1], #256] \n\t" - "vmla.f32 q14, q7, d3[0] \n\t" - "vmla.f32 q15, q7, d3[1] \n\t" - - // in_ptr2 multiply - "vld1.f32 {d0-d3}, [%[f1]]! \n\t" - "vmla.f32 q8, q4, d4[0] \n\t" - "vmla.f32 q9, q4, d4[1] \n\t" - - "vmla.f32 q10, q4, d5[0] \n\t" - "vmla.f32 q11, q4, d5[1] \n\t" - - "vmla.f32 q12, q4, d6[0] \n\t" - "vmla.f32 q13, q4, d6[1] \n\t" - - "pld [%[f1], #256] \n\t" - "vmla.f32 q14, q4, d7[0] \n\t" - "vmla.f32 q15, q4, d7[1] \n\t" - - "vld1.f32 {d4-d7}, [%[f1]]! \n\t" - "vmla.f32 q8, q5, d0[0] \n\t" - "vmla.f32 q9, q5, d0[1] \n\t" - - "vext.32 q7, q4, q6, #1 \n\t" - "vmla.f32 q10, q5, d1[0] \n\t" - "vmla.f32 q11, q5, d1[1] \n\t" - - "vmla.f32 q12, q5, d2[0] \n\t" - "vmla.f32 q13, q5, d2[1] \n\t" - - "pld [%[in_ptr1], #288] \n\t" - "vmla.f32 q14, q5, d3[0] \n\t" - "vmla.f32 q15, q5, d3[1] \n\t" - - "vld2.f32 {d8-d11}, [%[in_ptr1]]! 
\n\t" - "vmla.f32 q8, q7, d4[0] \n\t" - "vmla.f32 q9, q7, d4[1] \n\t" - - "pld [%[f1], #256] \n\t" - "vld1.f32 {d0-d3}, [%[f1]]! \n\t" - "vmla.f32 q10, q7, d5[0] \n\t" - "vmla.f32 q11, q7, d5[1] \n\t" - - "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" - "sub %[in_ptr1], %[in_stride] \n\t" - "sub %[in_ptr1], %[in_stride] \n\t" - "vmla.f32 q12, q7, d6[0] \n\t" - "vmla.f32 q13, q7, d6[1] \n\t" - - "sub %[in_ptr1], #64 \n\t" - "pld [%[f1], #256] \n\t" - "vmla.f32 q14, q7, d7[0] \n\t" - "vmla.f32 q15, q7, d7[1] \n\t" - - // in_ptr3 multiply - "vld1.f32 {d4-d7}, [%[f1]]! \n\t" - "vmla.f32 q8, q4, d0[0] \n\t" - "vmla.f32 q9, q4, d0[1] \n\t" - - "vmla.f32 q10, q4, d1[0] \n\t" - "vmla.f32 q11, q4, d1[1] \n\t" - - "vmla.f32 q12, q4, d2[0] \n\t" - "vmla.f32 q13, q4, d2[1] \n\t" - - "pld [%[f1], #256] \n\t" - "vmla.f32 q14, q4, d3[0] \n\t" - "vmla.f32 q15, q4, d3[1] \n\t" - - "vld1.f32 {d0-d3}, [%[f1]]! \n\t" - "vmla.f32 q8, q5, d4[0] \n\t" - "vmla.f32 q9, q5, d4[1] \n\t" - - "vext.32 q7, q4, q6, #1 \n\t" - "vmla.f32 q10, q5, d5[0] \n\t" - "vmla.f32 q11, q5, d5[1] \n\t" - - "vmla.f32 q12, q5, d6[0] \n\t" - "vmla.f32 q13, q5, d6[1] \n\t" - - "vmla.f32 q14, q5, d7[0] \n\t" - "vmla.f32 q15, q5, d7[1] \n\t" - - "sub %[f1], %[f1], #288 \n\t" - "vmla.f32 q8, q7, d0[0] \n\t" - "vmla.f32 q9, q7, d0[1] \n\t" - - "vmla.f32 q10, q7, d1[0] \n\t" - "vmla.f32 q11, q7, d1[1] \n\t" - - "vmla.f32 q12, q7, d2[0] \n\t" - "vmla.f32 q13, q7, d2[1] \n\t" - - "vmla.f32 q14, q7, d3[0] \n\t" - "vmla.f32 q15, q7, d3[1] \n\t" - - // store out_ptr - "pld [%[f1], #256] \n\t" - "vld1.f32 {d0-d3}, [%[f1]]! \n\t" - - "pld [%[in_ptr1], #288] \n\t" - "vld2.f32 {d8-d11}, [%[in_ptr1]]! \n\t" - "vst1.f32 {d16, d17}, [%[out_ptr1]]! \n\t" - "vst1.f32 {d18, d19}, [%[out_ptr1_c2]]! \n\t" - - "vst1.f32 {d20, d21}, [%[out_ptr1_c3]]! \n\t" - "vst1.f32 {d22, d23}, [%[out_ptr1_c4]]! \n\t" - - "vst1.f32 {d24, d25}, [%[out_ptr1_c5]]! \n\t" - "vst1.f32 {d26, d27}, [%[out_ptr1_c6]]! 
\n\t" - - "vld2.f32 {d12, d13}, [%[in_ptr1]] \n\t" - "add %[in_ptr1], %[in_stride] \n\t" - "vst1.f32 {d28, d29}, [%[out_ptr1_c7]]! \n\t" - - "subs %[loop], #1 \n\t" - "vst1.f32 {d30, d31}, [%[out_ptr1_c8]]! \n\t" - - // cycle - "bne 0b \n\t" - "sub %[f1], %[f1], #32 \n\t" - "sub %[in_ptr1], %[in_ptr1], #32 \n\t" - "sub %[in_ptr1], %[in_stride] \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [out_ptr1_c2] "+r"(out_ptr1_c2), - [out_ptr1_c3] "+r"(out_ptr1_c3), - [out_ptr1_c4] "+r"(out_ptr1_c4), - [out_ptr1_c5] "+r"(out_ptr1_c5), - [out_ptr1_c6] "+r"(out_ptr1_c6), - [out_ptr1_c7] "+r"(out_ptr1_c7), - [out_ptr1_c8] "+r"(out_ptr1_c8), [in_ptr1] "+r"(in_ptr1) - : [f1] "r"(f1), [in_stride] "r"(in_stride) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - - in_ptr2 = in_ptr1 + input_w; - in_ptr3 = in_ptr2 + input_w; - } - } -#endif // __aarch64__ -#endif // __ARM_NEON - - // remain output_width - for (; o_w < output_w; ++o_w) { - float sum1 = 0; - float sum1_c2 = 0; - float sum1_c3 = 0; - float sum1_c4 = 0; - float sum1_c5 = 0; - float sum1_c6 = 0; - float sum1_c7 = 0; - float sum1_c8 = 0; -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _pad_filter1_c2 = vld1q_f32(pad_filter1_c2); - float32x4_t _pad_filter1_c3 = vld1q_f32(pad_filter1_c3); - float32x4_t _pad_filter1_c4 = vld1q_f32(pad_filter1_c4); - float32x4_t _pad_filter1_c5 = vld1q_f32(pad_filter1_c5); - float32x4_t _pad_filter1_c6 = vld1q_f32(pad_filter1_c6); - float32x4_t _pad_filter1_c7 = vld1q_f32(pad_filter1_c7); - float32x4_t _pad_filter1_c8 = vld1q_f32(pad_filter1_c8); - - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - float32x4_t _sum1_c2 = vmulq_f32(_in_ptr1, _pad_filter1_c2); - float32x4_t _sum1_c3 = vmulq_f32(_in_ptr1, _pad_filter1_c3); - float32x4_t _sum1_c4 = vmulq_f32(_in_ptr1, _pad_filter1_c4); - float32x4_t _sum1_c5 = 
vmulq_f32(_in_ptr1, _pad_filter1_c5); - float32x4_t _sum1_c6 = vmulq_f32(_in_ptr1, _pad_filter1_c6); - float32x4_t _sum1_c7 = vmulq_f32(_in_ptr1, _pad_filter1_c7); - float32x4_t _sum1_c8 = vmulq_f32(_in_ptr1, _pad_filter1_c8); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - float32x4_t _pad_filter2_c2 = vld1q_f32(pad_filter2_c2); - float32x4_t _pad_filter2_c3 = vld1q_f32(pad_filter2_c3); - float32x4_t _pad_filter2_c4 = vld1q_f32(pad_filter2_c4); - float32x4_t _pad_filter2_c5 = vld1q_f32(pad_filter2_c5); - float32x4_t _pad_filter2_c6 = vld1q_f32(pad_filter2_c6); - float32x4_t _pad_filter2_c7 = vld1q_f32(pad_filter2_c7); - float32x4_t _pad_filter2_c8 = vld1q_f32(pad_filter2_c8); - - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr2, _pad_filter2_c2); - _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr2, _pad_filter2_c3); - _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr2, _pad_filter2_c4); - _sum1_c5 = vmlaq_f32(_sum1_c5, _in_ptr2, _pad_filter2_c5); - _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr2, _pad_filter2_c6); - _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr2, _pad_filter2_c7); - _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr2, _pad_filter2_c8); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - float32x4_t _pad_filter3_c2 = vld1q_f32(pad_filter3_c2); - float32x4_t _pad_filter3_c3 = vld1q_f32(pad_filter3_c3); - float32x4_t _pad_filter3_c4 = vld1q_f32(pad_filter3_c4); - float32x4_t _pad_filter3_c5 = vld1q_f32(pad_filter3_c5); - float32x4_t _pad_filter3_c6 = vld1q_f32(pad_filter3_c6); - float32x4_t _pad_filter3_c7 = vld1q_f32(pad_filter3_c7); - float32x4_t _pad_filter3_c8 = vld1q_f32(pad_filter3_c8); - - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - _sum1_c2 = vmlaq_f32(_sum1_c2, _in_ptr3, _pad_filter3_c2); - _sum1_c3 = vmlaq_f32(_sum1_c3, _in_ptr3, _pad_filter3_c3); - _sum1_c4 = vmlaq_f32(_sum1_c4, _in_ptr3, _pad_filter3_c4); - _sum1_c5 = 
vmlaq_f32(_sum1_c5, _in_ptr3, _pad_filter3_c5); - _sum1_c6 = vmlaq_f32(_sum1_c6, _in_ptr3, _pad_filter3_c6); - _sum1_c7 = vmlaq_f32(_sum1_c7, _in_ptr3, _pad_filter3_c7); - _sum1_c8 = vmlaq_f32(_sum1_c8, _in_ptr3, _pad_filter3_c8); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - _sum1_c2 = vsetq_lane_f32(sum1_c2, _sum1_c2, 3); - _sum1_c3 = vsetq_lane_f32(sum1_c3, _sum1_c3, 3); - _sum1_c4 = vsetq_lane_f32(sum1_c4, _sum1_c4, 3); - _sum1_c5 = vsetq_lane_f32(sum1_c5, _sum1_c5, 3); - _sum1_c6 = vsetq_lane_f32(sum1_c6, _sum1_c6, 3); - _sum1_c7 = vsetq_lane_f32(sum1_c7, _sum1_c7, 3); - _sum1_c8 = vsetq_lane_f32(sum1_c8, _sum1_c8, 3); - - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ss1_2 = - vadd_f32(vget_low_f32(_sum1_c2), vget_high_f32(_sum1_c2)); - float32x2_t _ss1_3 = - vadd_f32(vget_low_f32(_sum1_c3), vget_high_f32(_sum1_c3)); - float32x2_t _ss1_4 = - vadd_f32(vget_low_f32(_sum1_c4), vget_high_f32(_sum1_c4)); - float32x2_t _ss1_5 = - vadd_f32(vget_low_f32(_sum1_c5), vget_high_f32(_sum1_c5)); - float32x2_t _ss1_6 = - vadd_f32(vget_low_f32(_sum1_c6), vget_high_f32(_sum1_c6)); - float32x2_t _ss1_7 = - vadd_f32(vget_low_f32(_sum1_c7), vget_high_f32(_sum1_c7)); - float32x2_t _ss1_8 = - vadd_f32(vget_low_f32(_sum1_c8), vget_high_f32(_sum1_c8)); - - float32x2_t _ssss1_ssss1_2 = vpadd_f32(_ss1, _ss1_2); - float32x2_t _ssss1_3_ssss1_4 = vpadd_f32(_ss1_3, _ss1_4); - float32x2_t _ssss1_5_ssss1_6 = vpadd_f32(_ss1_5, _ss1_6); - float32x2_t _ssss1_7_ssss1_8 = vpadd_f32(_ss1_7, _ss1_8); - - sum1 += vget_lane_f32(_ssss1_ssss1_2, 0); - sum1_c2 += vget_lane_f32(_ssss1_ssss1_2, 1); - sum1_c3 += vget_lane_f32(_ssss1_3_ssss1_4, 0); - sum1_c4 += vget_lane_f32(_ssss1_3_ssss1_4, 1); - sum1_c5 += vget_lane_f32(_ssss1_5_ssss1_6, 0); - sum1_c6 += vget_lane_f32(_ssss1_5_ssss1_6, 1); - sum1_c7 += vget_lane_f32(_ssss1_7_ssss1_8, 0); - sum1_c8 += vget_lane_f32(_ssss1_7_ssss1_8, 1); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * 
pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; - - sum1_c2 += in_ptr1[0] * pad_filter1_c2[0]; - sum1_c2 += in_ptr1[1] * pad_filter1_c2[1]; - sum1_c2 += in_ptr1[2] * pad_filter1_c2[2]; - sum1_c2 += in_ptr2[0] * pad_filter2_c2[0]; - sum1_c2 += in_ptr2[1] * pad_filter2_c2[1]; - sum1_c2 += in_ptr2[2] * pad_filter2_c2[2]; - sum1_c2 += in_ptr3[0] * pad_filter3_c2[0]; - sum1_c2 += in_ptr3[1] * pad_filter3_c2[1]; - sum1_c2 += in_ptr3[2] * pad_filter3_c2[2]; - - sum1_c3 += in_ptr1[0] * pad_filter1_c3[0]; - sum1_c3 += in_ptr1[1] * pad_filter1_c3[1]; - sum1_c3 += in_ptr1[2] * pad_filter1_c3[2]; - sum1_c3 += in_ptr2[0] * pad_filter2_c3[0]; - sum1_c3 += in_ptr2[1] * pad_filter2_c3[1]; - sum1_c3 += in_ptr2[2] * pad_filter2_c3[2]; - sum1_c3 += in_ptr3[0] * pad_filter3_c3[0]; - sum1_c3 += in_ptr3[1] * pad_filter3_c3[1]; - sum1_c3 += in_ptr3[2] * pad_filter3_c3[2]; - - sum1_c4 += in_ptr1[0] * pad_filter1_c4[0]; - sum1_c4 += in_ptr1[1] * pad_filter1_c4[1]; - sum1_c4 += in_ptr1[2] * pad_filter1_c4[2]; - sum1_c4 += in_ptr2[0] * pad_filter2_c4[0]; - sum1_c4 += in_ptr2[1] * pad_filter2_c4[1]; - sum1_c4 += in_ptr2[2] * pad_filter2_c4[2]; - sum1_c4 += in_ptr3[0] * pad_filter3_c4[0]; - sum1_c4 += in_ptr3[1] * pad_filter3_c4[1]; - sum1_c4 += in_ptr3[2] * pad_filter3_c4[2]; - - sum1_c5 += in_ptr1[0] * pad_filter1_c5[0]; - sum1_c5 += in_ptr1[1] * pad_filter1_c5[1]; - sum1_c5 += in_ptr1[2] * pad_filter1_c5[2]; - sum1_c5 += in_ptr2[0] * pad_filter2_c5[0]; - sum1_c5 += in_ptr2[1] * pad_filter2_c5[1]; - sum1_c5 += in_ptr2[2] * pad_filter2_c5[2]; - sum1_c5 += in_ptr3[0] * pad_filter3_c5[0]; - sum1_c5 += in_ptr3[1] * pad_filter3_c5[1]; - sum1_c5 += in_ptr3[2] * pad_filter3_c5[2]; - - sum1_c6 += in_ptr1[0] * pad_filter1_c6[0]; - sum1_c6 += in_ptr1[1] * 
pad_filter1_c6[1]; - sum1_c6 += in_ptr1[2] * pad_filter1_c6[2]; - sum1_c6 += in_ptr2[0] * pad_filter2_c6[0]; - sum1_c6 += in_ptr2[1] * pad_filter2_c6[1]; - sum1_c6 += in_ptr2[2] * pad_filter2_c6[2]; - sum1_c6 += in_ptr3[0] * pad_filter3_c6[0]; - sum1_c6 += in_ptr3[1] * pad_filter3_c6[1]; - sum1_c6 += in_ptr3[2] * pad_filter3_c6[2]; - - sum1_c7 += in_ptr1[0] * pad_filter1_c7[0]; - sum1_c7 += in_ptr1[1] * pad_filter1_c7[1]; - sum1_c7 += in_ptr1[2] * pad_filter1_c7[2]; - sum1_c7 += in_ptr2[0] * pad_filter2_c7[0]; - sum1_c7 += in_ptr2[1] * pad_filter2_c7[1]; - sum1_c7 += in_ptr2[2] * pad_filter2_c7[2]; - sum1_c7 += in_ptr3[0] * pad_filter3_c7[0]; - sum1_c7 += in_ptr3[1] * pad_filter3_c7[1]; - sum1_c7 += in_ptr3[2] * pad_filter3_c7[2]; - - sum1_c8 += in_ptr1[0] * pad_filter1_c8[0]; - sum1_c8 += in_ptr1[1] * pad_filter1_c8[1]; - sum1_c8 += in_ptr1[2] * pad_filter1_c8[2]; - sum1_c8 += in_ptr2[0] * pad_filter2_c8[0]; - sum1_c8 += in_ptr2[1] * pad_filter2_c8[1]; - sum1_c8 += in_ptr2[2] * pad_filter2_c8[2]; - sum1_c8 += in_ptr3[0] * pad_filter3_c8[0]; - sum1_c8 += in_ptr3[1] * pad_filter3_c8[1]; - sum1_c8 += in_ptr3[2] * pad_filter3_c8[2]; -#endif - if (if_nopadding) { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - } else if (input_w > 3 && - (if_odd_pad_w && o_w == valid_w_start || - o_w == valid_w_end && if_odd_pad_w && if_exact_in_w || - o_w == valid_w_end + 1 && !if_odd_pad_w && - !if_exact_in_w)) { - pad_filter1--; - pad_filter2--; - pad_filter3--; - pad_filter1_c2--; - pad_filter2_c2--; - pad_filter3_c2--; - - pad_filter1_c3--; - pad_filter2_c3--; - pad_filter3_c3--; - pad_filter1_c4--; - pad_filter2_c4--; - pad_filter3_c4--; - - pad_filter1_c5--; - pad_filter2_c5--; - pad_filter3_c5--; - pad_filter1_c6--; - pad_filter2_c6--; - pad_filter3_c6--; - - pad_filter1_c7--; - pad_filter2_c7--; - pad_filter3_c7--; - pad_filter1_c8--; - pad_filter2_c8--; - pad_filter3_c8--; - - in_ptr1++; - in_ptr2++; - in_ptr3++; - } else if (input_w <= 3 || o_w < valid_w_start || - o_w > 
valid_w_end) { - pad_filter1 -= 2; - pad_filter2 -= 2; - pad_filter3 -= 2; - pad_filter1_c2 -= 2; - pad_filter2_c2 -= 2; - pad_filter3_c2 -= 2; - - pad_filter1_c3 -= 2; - pad_filter2_c3 -= 2; - pad_filter3_c3 -= 2; - pad_filter1_c4 -= 2; - pad_filter2_c4 -= 2; - pad_filter3_c4 -= 2; - - pad_filter1_c5 -= 2; - pad_filter2_c5 -= 2; - pad_filter3_c5 -= 2; - pad_filter1_c6 -= 2; - pad_filter2_c6 -= 2; - pad_filter3_c6 -= 2; - - pad_filter1_c7 -= 2; - pad_filter2_c7 -= 2; - pad_filter3_c7 -= 2; - pad_filter1_c8 -= 2; - pad_filter2_c8 -= 2; - pad_filter3_c8 -= 2; - } else { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - } - *out_ptr1 += sum1; - *out_ptr1_c2 += sum1_c2; - *out_ptr1_c3 += sum1_c3; - *out_ptr1_c4 += sum1_c4; - *out_ptr1_c5 += sum1_c5; - *out_ptr1_c6 += sum1_c6; - *out_ptr1_c7 += sum1_c7; - *out_ptr1_c8 += sum1_c8; - - out_ptr1++; - out_ptr1_c2++; - out_ptr1_c3++; - out_ptr1_c4++; - out_ptr1_c5++; - out_ptr1_c6++; - out_ptr1_c7++; - out_ptr1_c8++; - } - if (if_nopadding) { - in_ptr1 += remain_stride_w + input_w; - in_ptr2 += remain_stride_w + input_w; - in_ptr3 += remain_stride_w + input_w; - - } else if (input_h > 3 && - (if_odd_pad_h && o_h == valid_h_start || - o_h == valid_h_end && if_odd_pad_h && if_exact_in_h || - o_h == valid_h_end + 1 && !if_odd_pad_h && - !if_exact_in_h)) { - in_ptr1 += 3; - in_ptr2 += 3; - in_ptr3 += 3; - - pad_filter1 -= remain_stride_w; - pad_filter2 -= remain_stride_w; - pad_filter3 -= remain_stride_w; - pad_filter1_c2 -= remain_stride_w; - pad_filter2_c2 -= remain_stride_w; - pad_filter3_c2 -= remain_stride_w; - - pad_filter1_c3 -= remain_stride_w; - pad_filter2_c3 -= remain_stride_w; - pad_filter3_c3 -= remain_stride_w; - pad_filter1_c4 -= remain_stride_w; - pad_filter2_c4 -= remain_stride_w; - pad_filter3_c4 -= remain_stride_w; - - pad_filter1_c5 -= remain_stride_w; - pad_filter2_c5 -= remain_stride_w; - pad_filter3_c5 -= remain_stride_w; - pad_filter1_c6 -= remain_stride_w; - pad_filter2_c6 -= remain_stride_w; - 
pad_filter3_c6 -= remain_stride_w; - - pad_filter1_c7 -= remain_stride_w; - pad_filter2_c7 -= remain_stride_w; - pad_filter3_c7 -= remain_stride_w; - pad_filter1_c8 -= remain_stride_w; - pad_filter2_c8 -= remain_stride_w; - pad_filter3_c8 -= remain_stride_w; - } else if (input_h <= 3 || o_h < valid_h_start || o_h > valid_h_end) { - in_ptr1 -= input_w - 3; - in_ptr2 -= input_w - 3; - in_ptr3 -= input_w - 3; - - pad_filter1 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter1_c2 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2_c2 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3_c2 -= 3 + 2 * padding_w + remain_stride_w; - - pad_filter1_c3 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2_c3 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3_c3 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter1_c4 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2_c4 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3_c4 -= 3 + 2 * padding_w + remain_stride_w; - - pad_filter1_c5 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2_c5 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3_c5 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter1_c6 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2_c6 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3_c6 -= 3 + 2 * padding_w + remain_stride_w; - - pad_filter1_c7 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2_c7 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3_c7 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter1_c8 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2_c8 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3_c8 -= 3 + 2 * padding_w + remain_stride_w; - } else { - pad_filter1 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3 += 3 + 2 * padding_w - remain_stride_w; - pad_filter1_c2 += 3 + 2 * padding_w - 
remain_stride_w; - pad_filter2_c2 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3_c2 += 3 + 2 * padding_w - remain_stride_w; - - pad_filter1_c3 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2_c3 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3_c3 += 3 + 2 * padding_w - remain_stride_w; - pad_filter1_c4 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2_c4 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3_c4 += 3 + 2 * padding_w - remain_stride_w; - - pad_filter1_c5 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2_c5 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3_c5 += 3 + 2 * padding_w - remain_stride_w; - pad_filter1_c6 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2_c6 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3_c6 += 3 + 2 * padding_w - remain_stride_w; - - pad_filter1_c7 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2_c7 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3_c7 += 3 + 2 * padding_w - remain_stride_w; - pad_filter1_c8 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2_c8 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3_c8 += 3 + 2 * padding_w - remain_stride_w; - - in_ptr1 += input_w + 3; - in_ptr2 += input_w + 3; - in_ptr3 += input_w + 3; - } - } - - filter_data_ch += filter_ch_size; - filter_data_ch_c2 += filter_ch_size; - filter_data_ch_c3 += filter_ch_size; - filter_data_ch_c4 += filter_ch_size; - filter_data_ch_c5 += filter_ch_size; - filter_data_ch_c6 += filter_ch_size; - filter_data_ch_c7 += filter_ch_size; - filter_data_ch_c8 += filter_ch_size; - input_data_ch += in_ch_size; - } - } - - int out_ch_remain_start = output_ch - output_ch % 8; - - // remain output_channel -#pragma omp parallel for - for (int o_c = out_ch_remain_start; o_c < output_ch; ++o_c) { - const float *f1, *f9; - const float *in_ptr1, *in_ptr2, *in_ptr3; - const float *pad_filter1, *pad_filter2, *pad_filter3; - float pad_filter_arr[pad_filter_ch_size]; - float *output_data_ch; - const float *input_data_ch; 
- const float *filter_data_ch; - - filter_data_ch = filter_data + o_c * filter_ch_size * input_ch; - input_data_ch = input_data; - output_data_ch = output_data + o_c * out_ch_size; - - for (int i_c = 0; i_c < input_ch; ++i_c) { - f1 = filter_data_ch; - f9 = f1 + 8; - - if (!if_nopadding) { - memset(pad_filter_arr, 0.f, sizeof(pad_filter_arr)); - for (int i = 0; i < 9; ++i) { - int j = i / 3 * (2 * padding_w + 3) + i % 3 + padding_h * 3 + - padding_w * (2 * padding_h + 1); - pad_filter_arr[j] = filter_data_ch[i]; - } - pad_filter1 = pad_filter_arr; - pad_filter1 += pad_filter_start; - pad_filter2 = pad_filter1 + pad_filter_w; - pad_filter3 = pad_filter2 + pad_filter_w; - } else { - pad_filter1 = filter_data_ch; - pad_filter2 = pad_filter1 + 3; - pad_filter3 = pad_filter2 + 3; - } - - float *out_ptr1; - out_ptr1 = output_data_ch; - in_ptr1 = input_data_ch; - in_ptr2 = in_ptr1 + input_w; - in_ptr3 = in_ptr2 + input_w; - - int o_h = 0; - for (; o_h < output_h; ++o_h) { - int o_w = 0; - - // pad left - for (; o_w <= valid_w_start; ++o_w) { - float sum1 = 0; -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1); - sum1 += vget_lane_f32(_ssss1_ssss1, 0); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; 
- sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; -#endif - if (if_nopadding) { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - } else if (input_w > 3 && - (if_odd_pad_w && o_w == valid_w_start || - o_w == valid_w_end && if_odd_pad_w && if_exact_in_w || - o_w == valid_w_end + 1 && !if_odd_pad_w && - !if_exact_in_w)) { - pad_filter1--; - pad_filter2--; - pad_filter3--; - in_ptr1++; - in_ptr2++; - in_ptr3++; - - } else if (input_w <= 3 || o_w < valid_w_start || - o_w > valid_w_end) { - pad_filter1 -= 2; - pad_filter2 -= 2; - pad_filter3 -= 2; - } else { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - } - *out_ptr1 += sum1; - out_ptr1++; - } - // valid -#if __ARM_NEON -#if __aarch64__ - if (o_h > valid_h_start && o_h < valid_h_end) { - int loop = (valid_w_end - valid_w_start - 1) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "prfm pldl1keep, [%[f1], #256] \n\t" - "prfm pldl1keep, [%[f9], #256] \n\t" - - "ld1 {v0.4s, v1.4s}, [%[f1]] \n\t" - "ld1 {v4.s}[0], [%[f9]] \n\t" - - "0: \n\t" - // load out_ptr - "prfm pldl1keep, [%[out_ptr1], #128] \n\t" - "ld1 {v12.4s}, [%[out_ptr1]] \n\t" - - // in_ptr1 multiply - "prfm pldl1keep, [%[in_ptr1], #256] \n\t" - "ld2 {v5.4s, v6.4s}, [%[in_ptr1]], #32 \n\t" - "ld2 {v7.4s, v8.4s}, [%[in_ptr1]] \n\t" - - "fmla v12.4s, v5.4s, v0.s[0] \n\t" - "fmla v14.4s, v5.4s, v2.s[0] \n\t" - - "ext v8.16b, v5.16b, v7.16b, #4 \n\t" - "fmul v13.4s, v6.4s, v0.s[1] \n\t" - "fmla v12.4s, v8.4s, v0.s[2] \n\t" - - "ld2 {v5.4s, v6.4s}, [%[in_ptr2]], #32 \n\t" - "ld2 {v7.4s, v8.4s}, [%[in_ptr2]] \n\t" - - // in_ptr2 multiply - "fmla v13.4s, v5.4s, v0.s[3] \n\t" - "ext v8.16b, v5.16b, v7.16b, #4 \n\t" - "fmla v12.4s, v6.4s, v1.s[0] \n\t" - - "fmla v13.4s, v8.4s, v1.s[1] \n\t" - "ld2 {v5.4s, v6.4s}, [%[in_ptr3]], #32 \n\t" - "ld2 {v7.4s, v8.4s}, [%[in_ptr3]] \n\t" - - // in_ptr3 multiply - "fmla v12.4s, v5.4s, v1.s[2] \n\t" - "ext v8.16b, v5.16b, v7.16b, #4 \n\t" - - "fmla 
v13.4s, v6.4s, v1.s[3] \n\t" - "fmla v12.4s, v8.4s, v4.s[0] \n\t" - - // store out_ptr - "fadd v12.4s, v12.4s, v13.4s \n\t" - "st1 {v12.4s}, [%[out_ptr1]], #16 \n\t" - - // cycle - "subs %[loop], %[loop], #1 \n\t" - "bne 0b \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [in_ptr1] "+r"(in_ptr1), [in_ptr2] "+r"(in_ptr2), - [in_ptr3] "+r"(in_ptr3) - : [f1] "r"(f1), [f9] "r"(f9) - : "cc", "memory", "v0", "v1", "v4", "v5", "v6", "v7", "v8", - "v12", "v13"); - } - } -#else - if (o_h > valid_h_start && o_h < valid_h_end) { - int loop = (valid_w_end - valid_w_start - 1) >> 2; - o_w += loop * 4; - - if (loop > 0) { - asm volatile( - "pld [%[f1], #256] \n\t" - "pld [%[f9], #256] \n\t" - - "vld1.f32 {d0-d3}, [%[f1]] \n\t" - "vld1.f32 {d8[0]}, [%[f9]] \n\t" - - "pld [%[in_ptr1], #256] \n\t" - "vld2.f32 {d10-d13}, [%[in_ptr1]]! \n\t" - "vld2.f32 {d14, d15}, [%[in_ptr1]] \n\t" - - "0: \n\t" - // load out_ptr - "pld [%[out_ptr1], #128] \n\t" - "vld1.f32 {d24, d25}, [%[out_ptr1]] \n\t" - - // in_ptr1 multiply - "pld [%[in_ptr2], #256] \n\t" - "vld2.f32 {d4-d7}, [%[in_ptr2]]! \n\t" - - "vmla.f32 q12, q5, d0[0] \n\t" - "vld2.f32 {d20, d21}, [%[in_ptr2]] \n\t" - "vext.32 q8, q5, q7, #1 \n\t" - - "pld [%[in_ptr3], #256] \n\t" - "vmul.f32 q13, q6, d0[1] \n\t" - - "vld2.f32 {d10-d13}, [%[in_ptr3]]! \n\t" - "vmul.f32 q14, q8, d1[0] \n\t" - "vld2.f32 {d14, d15}, [%[in_ptr3]] \n\t" - - // in_ptr2 multiply - "vmul.f32 q15, q2, d1[1] \n\t" - "vext.32 q8, q2, q10, #1 \n\t" - - "vmla.f32 q12, q3, d2[0] \n\t" - "vmla.f32 q13, q8, d2[1] \n\t" - - // in_ptr3 multiply - "vmla.f32 q14, q5, d3[0] \n\t" - "vext.32 q8, q5, q7, #1 \n\t" - - "pld [%[in_ptr1], #256] \n\t" - "vmla.f32 q15, q6, d3[1] \n\t" - - "vld2.f32 {d10-d13}, [%[in_ptr1]]! 
\n\t" - "vmla.f32 q13, q8, d8[0] \n\t" - - // store out_ptr - "vld2.f32 {d14, d15}, [%[in_ptr1]] \n\t" - "vadd.f32 q12, q12, q13 \n\t" - "subs %[loop], #1 \n\t" - - "vadd.f32 q14, q14, q15 \n\t" - "vadd.f32 q12, q12, q14 \n\t" - "vst1.f32 {d24, d25}, [%[out_ptr1]]! \n\t" - - // cycle - "bne 0b \n\t" - "subs %[in_ptr1], %[in_ptr1], #32 \n\t" - - : [loop] "+r"(loop), [out_ptr1] "+r"(out_ptr1), - [in_ptr1] "+r"(in_ptr1), [in_ptr2] "+r"(in_ptr2), - [in_ptr3] "+r"(in_ptr3) - : [f1] "r"(f1), [f9] "r"(f9) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q10", "q12", "q13", "q14", "q15"); - } - } -#endif // __aarch64__ -#endif // __ARM_NEON - out_ptr1 -= 4; - out_ptr1 += 4; - - // remain output_width - for (; o_w < output_w; ++o_w) { - float sum1 = 0; -#if __ARM_NEON - float32x4_t _in_ptr1 = vld1q_f32(in_ptr1); - float32x4_t _pad_filter1 = vld1q_f32(pad_filter1); - float32x4_t _sum1 = vmulq_f32(_in_ptr1, _pad_filter1); - - float32x4_t _in_ptr2 = vld1q_f32(in_ptr2); - float32x4_t _pad_filter2 = vld1q_f32(pad_filter2); - _sum1 = vmlaq_f32(_sum1, _in_ptr2, _pad_filter2); - - float32x4_t _in_ptr3 = vld1q_f32(in_ptr3); - float32x4_t _pad_filter3 = vld1q_f32(pad_filter3); - _sum1 = vmlaq_f32(_sum1, _in_ptr3, _pad_filter3); - - _sum1 = vsetq_lane_f32(sum1, _sum1, 3); - float32x2_t _ss1 = - vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); - float32x2_t _ssss1_ssss1 = vpadd_f32(_ss1, _ss1); - sum1 += vget_lane_f32(_ssss1_ssss1, 0); -#else - sum1 += in_ptr1[0] * pad_filter1[0]; - sum1 += in_ptr1[1] * pad_filter1[1]; - sum1 += in_ptr1[2] * pad_filter1[2]; - sum1 += in_ptr2[0] * pad_filter2[0]; - sum1 += in_ptr2[1] * pad_filter2[1]; - sum1 += in_ptr2[2] * pad_filter2[2]; - sum1 += in_ptr3[0] * pad_filter3[0]; - sum1 += in_ptr3[1] * pad_filter3[1]; - sum1 += in_ptr3[2] * pad_filter3[2]; -#endif - if (if_nopadding) { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - } else if (input_w > 3 && - (if_odd_pad_w && o_w == valid_w_start || - o_w == valid_w_end 
&& if_odd_pad_w && if_exact_in_w || - o_w == valid_w_end + 1 && !if_odd_pad_w && - !if_exact_in_w)) { - pad_filter1--; - pad_filter2--; - pad_filter3--; - - in_ptr1++; - in_ptr2++; - in_ptr3++; - - } else if (input_w <= 3 || o_w < valid_w_start || - o_w > valid_w_end) { - pad_filter1 -= 2; - pad_filter2 -= 2; - pad_filter3 -= 2; - } else { - in_ptr1 += 2; - in_ptr2 += 2; - in_ptr3 += 2; - } - *out_ptr1 += sum1; - out_ptr1++; - } - if (if_nopadding) { - in_ptr1 += remain_stride_w + input_w; - in_ptr2 += remain_stride_w + input_w; - in_ptr3 += remain_stride_w + input_w; - } else if (input_h > 3 && - (if_odd_pad_h && o_h == valid_h_start || - o_h == valid_h_end && if_odd_pad_h && if_exact_in_h || - o_h == valid_h_end + 1 && !if_odd_pad_h && - !if_exact_in_h)) { - in_ptr1 += 3; - in_ptr2 += 3; - in_ptr3 += 3; - - pad_filter1 -= remain_stride_w; - pad_filter2 -= remain_stride_w; - pad_filter3 -= remain_stride_w; - - } else if (input_h <= 3 || o_h < valid_h_start || o_h > valid_h_end) { - in_ptr1 -= input_w - 3; - in_ptr2 -= input_w - 3; - in_ptr3 -= input_w - 3; - - pad_filter1 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter2 -= 3 + 2 * padding_w + remain_stride_w; - pad_filter3 -= 3 + 2 * padding_w + remain_stride_w; - } else { - pad_filter1 += 3 + 2 * padding_w - remain_stride_w; - pad_filter2 += 3 + 2 * padding_w - remain_stride_w; - pad_filter3 += 3 + 2 * padding_w - remain_stride_w; - - in_ptr1 += input_w + 3; - in_ptr2 += input_w + 3; - in_ptr3 += input_w + 3; - } - } - filter_data_ch += filter_ch_size; - input_data_ch += in_ch_size; - } - } - input_data += in_batch_size; - output_data += out_batch_size; - } -} - -template <> -void SlidingwindowConv3x3s1Faster( - const framework::Tensor *input, framework::Tensor *filter, - const std::vector &paddings, framework::Tensor *output, - const float *bias, bool is_bias, bool is_relu) { - const float *din = input->data(); - float *dout = output->mutable_data(); - const float *weights = filter->mutable_data(); - if 
(!is_bias) { - bias = nullptr; - } - bool relu = is_relu; - const int num = input->dims()[0]; - const int chin = input->dims()[1]; - const int hin = input->dims()[2]; - const int win = input->dims()[3]; - const int chout = output->dims()[1]; - const int hout = output->dims()[2]; - const int wout = output->dims()[3]; - const int pad_h = paddings[0]; - const int pad_w = paddings[1]; - const int threads = framework::CPUContext::Context()->get_thread_num(); - int l2_size = - framework::CPUContext::Context()->get_l2_cache_size() / sizeof(float); - - const int hout_c_block = 4; - const int hout_r_kernel = 2; - const int wout_block = 4; - const int wout_round = ((wout + wout_block - 1) / wout_block) * wout_block; - const int win_round = wout_round + 2; - - int hout_r_block = (l2_size - 2 * win_round * chin) / - (win_round * chin + hout_c_block * wout_round * threads); - hout_r_block = hout_r_block > hout ? hout : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? 
hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block + 2; - - float ptr_zero[win_round]; - memset(ptr_zero, 0, sizeof(float) * win_round); - float ptr_write[wout_round]; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - float *pre_din = - static_cast(framework::CPUContext::Context()->get_work_space( - (pre_in_size + threads * pre_out_size) * sizeof(float))); - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; // kernel_w * kernel_h; - int w_stride_chin = hout_c_block * 9; // kernel_w * kernel_h * - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int c_remain = chout - (chout / hout_c_block) * hout_c_block; - int c_round_down = (chout / hout_c_block) * hout_c_block; - - int out_row_stride = hout_c_block * wout_round; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * chin * size_in_channel; - float *dout_batch = dout + n * chout * size_out_channel; - for (int h = 0; h < hout; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > hout) { - h_kernel = hout - h; - } - int hs = h - pad_h; - int he = hs + h_kernel + 2; - slidingwindow_prepack_input(din_batch, pre_din, 0, chin, hs, he, ws, we, - chin, win, hin, ptr_zero); -#pragma omp parallel for - for (int c = 0; c < chout - (hout_c_block - 1); c += hout_c_block) { -#ifdef _OPENMP - float *pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float *pre_out = pre_din + pre_in_size; -#endif - const float *block_inr0 = pre_din; - const float *block_inr1 = block_inr0 + in_len; - const float *block_inr2 = block_inr1 + in_len; - const float *block_inr3 = block_inr2 + in_len; - - const float *weight_c = weights + c * w_stride; - const float *bias_ptr = ptr_zero; - if (bias != nullptr) { - bias_ptr = bias + c; - } - slidingwindow_fill_bias(pre_out, bias_ptr, - 
wout_round * hout_c_block * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float *wc0 = weight_c; - - const float *inr0 = block_inr0; - const float *inr1 = block_inr1; - const float *inr2 = block_inr2; - const float *inr3 = block_inr3; - - float *pre_out0 = pre_out + hk * out_row_stride; - float *pre_out1 = pre_out0 + out_row_stride; -#ifdef __aarch64__ - for (int i = 0; i < chin; ++i) { - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - float32x4_t w0 = vld1q_f32(wc0); // w0, v23 - float32x4_t w1 = vld1q_f32(wc0 + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(wc0 + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(wc0 + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(wc0 + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(wc0 + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(wc0 + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(wc0 + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(wc0 + 32); // w8, v31 - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - - int cnt = w_loop; - asm volatile( - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr10, outr11*/ - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - "ldp q2, q3, [%[r1]], #16 \n" /* load input r1*/ - "2: \n" /* main loop*/ - /* r0, r1, mul w0, get out r0, r1 */ - "fmla v15.4s , %[w0].4s, v0.s[0]\n" /* outr00 = w0 * r0[0]*/ - "fmla v16.4s , %[w0].4s, v0.s[1]\n" /* outr01 = w0 * r0[1]*/ - "fmla v17.4s , %[w0].4s, v0.s[2]\n" /* outr02 = w0 * r0[2]*/ - "fmla v18.4s , %[w0].4s, v0.s[3]\n" /* outr03 = w0 * r0[3]*/ - "fmla v19.4s , %[w0].4s, v2.s[0]\n" /* outr10 = w0 * r1[0]*/ - "fmla v20.4s , %[w0].4s, v2.s[1]\n" /* outr11 = w0 * r1[1]*/ - "fmla v21.4s , %[w0].4s, v2.s[2]\n" /* outr12 = w0 * r1[2]*/ - "fmla v22.4s , %[w0].4s, v2.s[3]\n" /* outr13 = w0 * r1[3]*/ - 
- /* r0, r1, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v0.s[1]\n" /* outr00 = w1 * r0[1]*/ - "fmla v16.4s , %[w1].4s, v0.s[2]\n" /* outr01 = w1 * r0[2]*/ - "fmla v17.4s , %[w1].4s, v0.s[3]\n" /* outr02 = w1 * r0[3]*/ - "fmla v18.4s , %[w1].4s, v1.s[0]\n" /* outr03 = w1 * r0[4]*/ - "fmla v19.4s , %[w1].4s, v2.s[1]\n" /* outr10 = w1 * r1[1]*/ - "fmla v20.4s , %[w1].4s, v2.s[2]\n" /* outr11 = w1 * r1[2]*/ - "fmla v21.4s , %[w1].4s, v2.s[3]\n" /* outr12 = w1 * r1[3]*/ - "fmla v22.4s , %[w1].4s, v3.s[0]\n" /* outr13 = w1 * r1[4]*/ - - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - - /* r0, r1, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v0.s[2]\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v0.s[3]\n" /* outr01 = w2 * r0[3]*/ - "fmla v17.4s , %[w2].4s, v1.s[0]\n" /* outr02 = w2 * r0[0]*/ - "fmla v18.4s , %[w2].4s, v1.s[1]\n" /* outr03 = w2 * r0[1]*/ - "fmla v19.4s , %[w2].4s, v2.s[2]\n" /* outr10 = w2 * r1[2]*/ - "fmla v20.4s , %[w2].4s, v2.s[3]\n" /* outr11 = w2 * r1[3]*/ - "fmla v21.4s , %[w2].4s, v3.s[0]\n" /* outr12 = w2 * r1[0]*/ - "fmla v22.4s , %[w2].4s, v3.s[1]\n" /* outr13 = w2 * r1[1]*/ - - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v2.s[0]\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v2.s[1]\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v2.s[2]\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v2.s[3]\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v4.s[0]\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v4.s[1]\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v4.s[2]\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v4.s[3]\n" /* outr13 = w3 * r2[3]*/ - - "ldp q0, q1, [%[r0]], #16 \n" /* load next input r0*/ - - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v2.s[1]\n" /* outr00 = w4 * r1[1]*/ - "fmla v16.4s , %[w4].4s, v2.s[2]\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v2.s[3]\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , 
%[w4].4s, v3.s[0]\n" /* outr03 = w4 * r1[4]*/ - "fmla v19.4s , %[w4].4s, v4.s[1]\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v4.s[2]\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v4.s[3]\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v5.s[0]\n" /* outr13 = w4 * r2[4]*/ - - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v2.s[2]\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v2.s[3]\n" /* outr01 = w5 * r1[3]*/ - "fmla v17.4s , %[w5].4s, v3.s[0]\n" /* outr02 = w5 * r1[0]*/ - "fmla v18.4s , %[w5].4s, v3.s[1]\n" /* outr03 = w5 * r1[1]*/ - "fmla v19.4s , %[w5].4s, v4.s[2]\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v4.s[3]\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v5.s[0]\n" /* outr12 = w5 * r2[0]*/ - "fmla v22.4s , %[w5].4s, v5.s[1]\n" /* outr13 = w5 * r2[1]*/ - - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v4.s[0]\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v4.s[1]\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v4.s[2]\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v4.s[3]\n" /* outr03 = w6 * r2[3]*/ - "fmla v19.4s , %[w6].4s, v6.s[0]\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v6.s[1]\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v6.s[2]\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v6.s[3]\n" /* outr13 = w6 * r3[3]*/ - - "ldp q2, q3, [%[r1]], #16 \n" /* load next input r1*/ - - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v4.s[1]\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v4.s[2]\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v4.s[3]\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v5.s[0]\n" /* outr03 = w7 * r2[4]*/ - "fmla v19.4s , %[w7].4s, v6.s[1]\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v6.s[2]\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v6.s[3]\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , 
%[w7].4s, v7.s[0]\n" /* outr13 = w7 * r3[4]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v4.s[2]\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v4.s[3]\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v5.s[0]\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.s[1]\n" /* outr03 = w8 * r2[1]*/ - - "stp q15, q16, [%[ptr_out0]], #32\n" /* save outr00, outr01*/ - "fmla v19.4s , %[w8].4s, v6.s[2]\n" /* outr10 = w8 * r3[2]*/ - "stp q17, q18, [%[ptr_out0]], #32\n" /* save outr02, outr03*/ - "fmla v20.4s , %[w8].4s, v6.s[3]\n" /* outr11 = w8 * r3[3]*/ - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - "fmla v21.4s , %[w8].4s, v7.s[0]\n" /* outr12 = w8 * r3[0]*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - "fmla v22.4s , %[w8].4s, v7.s[1]\n" /* outr13 = w8 * r3[1]*/ - "stp q19, q20, [%[ptr_out1]], #32\n" /* save outr10, outr11*/ - "stp q21, q22, [%[ptr_out1]], #32\n" /* save outr12, outr13*/ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr12, outr13*/ - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [r3] "+r"(r3), [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), [w3] "w"(w3), - [w4] "w"(w4), [w5] "w"(w5), [w6] "w"(w6), [w7] "w"(w7), - [w8] "w"(w8) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < chin; ++i) { - const float *wc0 = weight_c + i * w_stride_chin; - - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - - int cnt = 
w_loop; - asm volatile( - "vld1.32 {d16-d19}, [%[ptr_out0]]! @ load " - "outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - /* load r0, r1 */ - "vld1.32 {d0-d1}, [%[r0]]! @ load r0, " - "4 float\n" - "vld1.32 {d2}, [%[r0]] @ load r0, " - "2 float\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - /* main loop */ - "0: @ main " - "loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vld1.32 {d24-d27}, [%[ptr_out1]]! @ load " - "outr1, w0, w1, c0~c3\n" - "vmla.f32 q8, q5, d0[0] @ w0 * " - "inr00\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load " - "outr1, w2, w3, c0~c3\n" - "vmla.f32 q9, q5, d0[1] @ w0 * " - "inr01\n" - "vmla.f32 q10, q5, d1[0] @ w0 * " - "inr02\n" - "vmla.f32 q11, q5, d1[1] @ w0 * " - "inr03\n" - "vld1.32 {d3-d4}, [%[r1]]! @ load r1, " - "4 float\n" - "vmla.f32 q8, q6, d0[1] @ w1 * " - "inr01\n" - "vmla.f32 q9, q6, d1[0] @ w1 * " - "inr02\n" - "vmla.f32 q10, q6, d1[1] @ w1 * " - "inr03\n" - "vmla.f32 q11, q6, d2[0] @ w1 * " - "inr04\n" - "vld1.32 {d5}, [%[r1]] @ load r0, " - "2 float\n" - "vmla.f32 q8, q7, d1[0] @ w2 * " - "inr02\n" - "vmla.f32 q9, q7, d1[1] @ w2 * " - "inr03\n" - "vmla.f32 q10, q7, d2[0] @ w2 * " - "inr04\n" - "vmla.f32 q11, q7, d2[1] @ w2 * " - "inr05\n" - - "sub %[ptr_out1], %[ptr_out1], #32 @ ptr_out1 " - "- 32, to start address\n" - - /* mul r1 with w0, w1, w2, get out r1 */ - "vmla.f32 q12, q5, d3[0] @ w0 * " - "inr10\n" - "vmla.f32 q13, q5, d3[1] @ w0 * " - "inr11\n" - "vmla.f32 q14, q5, d4[0] @ w0 * " - "inr12\n" - "vmla.f32 q15, q5, d4[1] @ w0 * " - "inr13\n" - "vmla.f32 q12, q6, d3[1] @ w1 * " - "inr11\n" - "vmla.f32 q13, q6, d4[0] @ w1 * " - "inr12\n" - "vmla.f32 q14, q6, d4[1] @ w1 * " - "inr13\n" - "vmla.f32 q15, q6, d5[0] @ w1 * " - "inr14\n" - "vld1.32 {d10-d13}, [%[wc0]]! 
@ load w3, " - "w4, to q5, q6\n" - "vmla.f32 q12, q7, d4[0] @ w2 * " - "inr12\n" - "vmla.f32 q13, q7, d4[1] @ w2 * " - "inr13\n" - "vmla.f32 q14, q7, d5[0] @ w2 * " - "inr14\n" - "vmla.f32 q15, q7, d5[1] @ w2 * " - "inr15\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w5, " - "to q7\n" - - /* mul r1 with w3, w4, w5, get out r0 */ - "vmla.f32 q8, q5, d3[0] @ w3 * " - "inr10\n" - "vmla.f32 q9, q5, d3[1] @ w3 * " - "inr11\n" - "vmla.f32 q10, q5, d4[0] @ w3 * " - "inr12\n" - "vmla.f32 q11, q5, d4[1] @ w3 * " - "inr13\n" - "vld1.32 {d0-d1}, [%[r2]]! @ load r2, " - "4 float\n" - "vmla.f32 q8, q6, d3[1] @ w4 * " - "inr11\n" - "vmla.f32 q9, q6, d4[0] @ w4 * " - "inr12\n" - "vmla.f32 q10, q6, d4[1] @ w4 * " - "inr13\n" - "vmla.f32 q11, q6, d5[0] @ w4 * " - "inr14\n" - "vld1.32 {d2}, [%[r2]] @ load r2, " - "2 float\n" - "vmla.f32 q8, q7, d4[0] @ w5 * " - "inr12\n" - "vmla.f32 q9, q7, d4[1] @ w5 * " - "inr13\n" - "vmla.f32 q10, q7, d5[0] @ w5 * " - "inr14\n" - "vmla.f32 q11, q7, d5[1] @ w5 * " - "inr15\n" - - /* mul r2 with w3, w4, w5, get out r1 */ - "vmla.f32 q12, q5, d0[0] @ w3 * " - "inr20\n" - "vmla.f32 q13, q5, d0[1] @ w3 * " - "inr21\n" - "vmla.f32 q14, q5, d1[0] @ w3 * " - "inr22\n" - "vmla.f32 q15, q5, d1[1] @ w3 * " - "inr23\n" - "vmla.f32 q12, q6, d0[1] @ w4 * " - "inr21\n" - "vmla.f32 q13, q6, d1[0] @ w4 * " - "inr22\n" - "vmla.f32 q14, q6, d1[1] @ w4 * " - "inr23\n" - "vmla.f32 q15, q6, d2[0] @ w4 * " - "inr24\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w6, " - "w7, to q5, q6\n" - "vmla.f32 q12, q7, d1[0] @ w5 * " - "inr22\n" - "vmla.f32 q13, q7, d1[1] @ w5 * " - "inr23\n" - "vmla.f32 q14, q7, d2[0] @ w5 * " - "inr24\n" - "vmla.f32 q15, q7, d2[1] @ w5 * " - "inr25\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w8, " - "to q7\n" - - "sub %[wc0], %[wc0], #144 @ wc0 - " - "144 to start address\n" - - /* mul r2 with w6, w7, w8, get out r0 */ - "vmla.f32 q8, q5, d0[0] @ w6 * " - "inr20\n" - "vmla.f32 q9, q5, d0[1] @ w6 * " - "inr21\n" - "vld1.32 {d3-d4}, [%[r3]]! 
@ load r3, " - "4 float\n" - "vmla.f32 q10, q5, d1[0] @ w6 * " - "inr22\n" - "vmla.f32 q11, q5, d1[1] @ w6 * " - "inr23\n" - "vmla.f32 q8, q6, d0[1] @ w7 * " - "inr21\n" - "vmla.f32 q9, q6, d1[0] @ w7 * " - "inr22\n" - "vld1.32 {d5}, [%[r3]] @ load r3, " - "2 float\n" - "vmla.f32 q10, q6, d1[1] @ w7 * " - "inr23\n" - "vmla.f32 q11, q6, d2[0] @ w7 * " - "inr24\n" - "vmla.f32 q8, q7, d1[0] @ w8 * " - "inr22\n" - "vmla.f32 q9, q7, d1[1] @ w8 * " - "inr23\n" - "vld1.32 {d0-d1}, [%[r0]]! @ load r0, " - "4 float\n" - "vmla.f32 q10, q7, d2[0] @ w8 * " - "inr24\n" - "vmla.f32 q11, q7, d2[1] @ w8 * " - "inr25\n" - "vld1.32 {d2}, [%[r0]] @ load r0, " - "2 float\n" - - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q5, d3[0] @ w6 * " - "inr20\n" - "vmla.f32 q13, q5, d3[1] @ w6 * " - "inr21\n" - "vst1.32 {d16-d19}, [%[ptr_out0]]! @ save " - "r00, r01, c0~c3\n" - "vmla.f32 q14, q5, d4[0] @ w6 * " - "inr22\n" - "vmla.f32 q15, q5, d4[1] @ w6 * " - "inr23\n" - "vst1.32 {d20-d23}, [%[ptr_out0]]! @ save " - "r02, r03, c0~c3\n" - "vmla.f32 q12, q6, d3[1] @ w7 * " - "inr21\n" - "vmla.f32 q13, q6, d4[0] @ w7 * " - "inr22\n" - "vld1.32 {d16-d19}, [%[ptr_out0]]! @ load " - "outr0, w0, w1, c0~c3\n" - "vmla.f32 q14, q6, d4[1] @ w7 * " - "inr23\n" - "vmla.f32 q15, q6, d5[0] @ w7 * " - "inr24\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vmla.f32 q12, q7, d4[0] @ w8 * " - "inr22\n" - "vmla.f32 q13, q7, d4[1] @ w8 * " - "inr23\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - "vmla.f32 q14, q7, d5[0] @ w8 * " - "inr24\n" - "vmla.f32 q15, q7, d5[1] @ w8 * " - "inr25\n" - - "vst1.32 {d24-d27}, [%[ptr_out1]]! @ save " - "r10, r11, c0~c3\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save " - "r12, r13, c0~c3\n" - "vld1.32 {d14-d15}, [%[wc0]]! 
@ load w2, " - "to q7\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - "subs %[cnt], #1 @ loop " - "count--\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [r3] "+r"(r3), [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), [wc0] "+r"(wc0) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr2; - block_inr1 = block_inr3; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - } - slidingwindow_writeout_c4_fp32(pre_out, dout_batch, c, c + hout_c_block, - h, h + h_kernel, 0, wout_round, chout, - hout, wout, relu, ptr_write); - } - const float *weight_remain_ptr = weights + c_round_down * w_stride; -#pragma omp parallel for - for (int c = 0; c < c_remain; ++c) { -#ifdef USE_OPENMP - float *pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float *pre_out = pre_din + pre_in_size; -#endif - - int c_idx = c_round_down + c; - - int h_kernel = hout_r_block; - if (h + hout_r_block > hout) { - h_kernel = hout - h; - } - - const float *block_inr0 = pre_din; - const float *block_inr1 = block_inr0 + in_len; - const float *block_inr2 = block_inr1 + in_len; - const float *block_inr3 = block_inr2 + in_len; - - const float *bias_ptr = ptr_zero; - if (bias != nullptr) { - bias_ptr = bias + c_idx; - } - slidingwindow_fill_bias(pre_out, bias_ptr, 1, wout_round * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float *wc0 = weight_remain_ptr; - - const float *inr0 = block_inr0; - const float *inr1 = block_inr1; - const float *inr2 = block_inr2; - const float *inr3 = block_inr3; - - float *pre_out0 = pre_out + hk * wout_round; - float *pre_out1 = pre_out0 + wout_round; -#ifdef __aarch64__ - for 
(int i = 0; i < chin; ++i) { - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - float32x4_t w0 = vdupq_n_f32(wc0[c]); // w0, v23 - float32x4_t w1 = vdupq_n_f32(wc0[4 + c]); // w1, v24 - float32x4_t w2 = vdupq_n_f32(wc0[8 + c]); // w2, v25 - float32x4_t w3 = vdupq_n_f32(wc0[12 + c]); // w3, v26 - float32x4_t w4 = vdupq_n_f32(wc0[16 + c]); // w4, v27 - float32x4_t w5 = vdupq_n_f32(wc0[20 + c]); // w5, v28 - float32x4_t w6 = vdupq_n_f32(wc0[24 + c]); // w6, v29 - float32x4_t w7 = vdupq_n_f32(wc0[28 + c]); // w7, v30 - float32x4_t w8 = vdupq_n_f32(wc0[32 + c]); // w8, v31 - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - - int cnt = w_loop; - asm volatile( - "ldr q21, [%[ptr_out0]] \n" /* load outr0, w0~w3*/ - "ldr q22, [%[ptr_out1]] \n" /* load outr1, w0~w3*/ - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - "ldp q2, q3, [%[r1]], #16 \n" /* load input r1*/ - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - "2: \n" /* main loop*/ - - "fmla v21.4s , %[w0].4s, v0.4s \n" /* outr0 = w0 * r0*/ - "fmla v22.4s , %[w0].4s, v2.4s \n" /* outr1 = w0 * r1*/ - - "ext v8.16b, v0.16b, v1.16b, #4 \n" /* shift r0 left 1*/ - "ext v10.16b, v2.16b, v3.16b, #4 \n" /* shift r1 left 1*/ - "ext v9.16b, v0.16b, v1.16b, #8 \n" /* shift r0 left 2*/ - "ext v11.16b, v2.16b, v3.16b, #8 \n" /* shift r1 left 2*/ - - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - - "fmla v21.4s , %[w1].4s, v8.4s \n" /* outr0 = w1 * r1*/ - "fmla v22.4s , %[w1].4s, v10.4s \n" /* outr1 = w1 * r2*/ - - "fmla v21.4s , %[w2].4s, v9.4s \n" /* outr0 = w2 * r1*/ - "fmla v22.4s , %[w2].4s, v11.4s \n" /* outr1 = w2 * r2*/ - - "fmla v21.4s , %[w3].4s, v2.4s \n" /* outr0 = w3 * r1*/ - "fmla v22.4s , %[w3].4s, v4.4s \n" /* outr1 = w3 * r2*/ - - "ext v12.16b, v4.16b, v5.16b, #4\n" /* shift r2 left 1*/ - "ext v14.16b, v6.16b, v7.16b, #4\n" /* shift r3 left 1*/ - "ext v13.16b, v4.16b, v5.16b, #8\n" /* 
shift r2 left 2*/ - "ext v15.16b, v6.16b, v7.16b, #8\n" /* shift r3 left 2*/ - - "fmla v21.4s , %[w4].4s, v10.4s \n" /* outr0 = w4 * r1*/ - "fmla v22.4s , %[w4].4s, v12.4s \n" /* outr1 = w4 * r2*/ - - "fmla v21.4s , %[w5].4s, v11.4s \n" /* outr0 = w5 * r1*/ - "fmla v22.4s , %[w5].4s, v13.4s \n" /* outr1 = w5 * r2*/ - - "ldp q2, q3, [%[r1]], #16 \n" /* load input r0*/ - - "fmla v21.4s , %[w6].4s, v4.4s \n" /* outr0 = w6 * r2*/ - "fmla v22.4s , %[w6].4s, v6.4s \n" /* outr1 = w6 * r3*/ - - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - - "fmla v21.4s , %[w7].4s, v12.4s \n" /* outr0 = w7 * r1*/ - "fmla v22.4s , %[w7].4s, v14.4s \n" /* outr1 = w7 * r2*/ - - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - - "fmla v21.4s , %[w8].4s, v13.4s \n" /* outr0 = w8 * r1*/ - "fmla v22.4s , %[w8].4s, v15.4s \n" /* outr1 = w8 * r2*/ - - "str q21, [%[ptr_out0]], #16 \n" /*write output r0*/ - "str q22, [%[ptr_out1]], #16 \n" /*write output r1*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - "ldr q21, [%[ptr_out0]] \n" /* load outr0, w0~w3*/ - "ldr q22, [%[ptr_out1]] \n" /* load outr1, w0~w3*/ - - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [r3] "+r"(r3), [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), [w3] "w"(w3), - [w4] "w"(w4), [w5] "w"(w5), [w6] "w"(w6), [w7] "w"(w7), - [w8] "w"(w8) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v21", "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < chin; ++i) { - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - //! 
get valid weights of current output channel - float w_tmp[10] = { - wc0[c], wc0[c + 4], wc0[c + 8], wc0[c + 12], wc0[c + 16], - wc0[c + 20], wc0[c + 24], wc0[c + 28], wc0[c + 32], 0.f}; - float32x4_t w0 = vld1q_f32(w_tmp); // w0, w1, w2, q0 - float32x4_t w1 = vld1q_f32(w_tmp + 3); // w3, w4, w5, q1 - float32x4_t w2 = vld1q_f32(w_tmp + 6); // w6, w7, w8, q2 - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - int cnt = w_loop / 2; - if (cnt > 0) { - asm volatile( - "vld1.32 {d24-d27}, [%[ptr_out0]] @ load or00, " - "or01\n" - "vld1.32 {d6-d9}, [%[r0]]! @ load r0, 8 " - "float\n" - "vld1.32 {d10}, [%[r0]] @ load r0, 2 " - "float\n" - /* main loop */ - "0: @ main loop\n" - /* r0 * w0, w1, w2, get out r0*/ - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load or10, " - "or11\n" - "vext.32 q8, q3, q4, #1 @ r0, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r0, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q12, q3, %e[w0][0] @ w00 * r0, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w0][0] @ w00 * r0, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r0, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r0, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w0][1] @ w01 * r0, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w0][1] @ w01 * r0, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r1]]! 
@ load r1, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w0][0] @ w02 * r0, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w0][0] @ w02 * r0, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r1]] @ load r1, 2 " - "float\n" - - /* r1 * w3, w4, w5, get out r0*/ - /* r1 * w0, w1, w2, get out r1*/ - "vmla.f32 q12, q3, %e[w1][0] @ w10 * r1, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w1][0] @ w10 * r1, " - "4, 5, 6, 7\n" - "vext.32 q8, q3, q4, #1 @ r1, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r1, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w0][0] @ w00 * r1, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w0][0] @ w00 * r1, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r1, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r1, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w1][1] @ w11 * r1, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w1][1] @ w11 * r1, " - "5, 6, 7, 8\n" - "vmla.f32 q14, q8, %e[w0][1] @ w01 * r1, " - "1, 2, 3, 4\n" - "vmla.f32 q15, q9, %e[w0][1] @ w01 * r1, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r2]]! 
@ load r2, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w1][0] @ w12 * r1, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w1][0] @ w12 * r1, " - "6, 7, 8, 9\n" - "vmla.f32 q14, q10, %f[w0][0] @ w02 * r1, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w0][0] @ w02 * r1, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r2]] @ load r2, 2 " - "float\n" - - /* r2 * w6, w7, w8, get out r0*/ - /* r2 * w3, w4, w5, get out r1*/ - "vmla.f32 q12, q3, %e[w2][0] @ w20 * r2, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w2][0] @ w20 * r2, " - "4, 5, 6, 7\n" - "vext.32 q8, q3, q4, #1 @ r2, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r2, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w1][0] @ w10 * r2, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w1][0] @ w10 * r2, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r2, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r2, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w2][1] @ w21 * r2, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w2][1] @ w21 * r2, " - "5, 6, 7, 8\n" - "vmla.f32 q14, q8, %e[w1][1] @ w11 * r2, " - "1, 2, 3, 4\n" - "vmla.f32 q15, q9, %e[w1][1] @ w11 * r2, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r3]]! @ load r3, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w2][0] @ w22 * r2, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w2][0] @ w22 * r2, " - "6, 7, 8, 9\n" - "vmla.f32 q14, q10, %f[w1][0] @ w12 * r2, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w1][0] @ w12 * r2, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r3]] @ load r3, 2 " - "float\n" - - /* r3 * w6, w7, w8, get out r1*/ - "vext.32 q8, q3, q4, #1 @ r3, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r3, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w2][0] @ w20 * r3, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w2][0] @ w20 * r3, " - "4, 5, 6, 7\n" - "vst1.32 {d24-d27}, [%[ptr_out0]]! 
@ save or00, " - "or01\n" - "vext.32 q10, q3, q4, #2 @ r3, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r3, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q14, q8, %e[w2][1] @ w21 * r3, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q9, %e[w2][1] @ w21 * r3, " - "4, 5, 6, 7\n" - "vld1.32 {d24-d27}, [%[ptr_out0]] @ load or00, " - "or01\n" - "vld1.32 {d6-d9}, [%[r0]]! @ load r3, 8 " - "float\n" - "vmla.f32 q14, q10, %f[w2][0] @ w22 * r3, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w2][0] @ w22 * r3, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r0]] @ load r0, 2 " - "float\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save or10, " - "or11\n" - - "subs %[cnt], #1 @loop count " - "-1\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), - [r2] "+r"(r2), [r3] "+r"(r3), [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2) - : "cc", "memory", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); - r0 -= 8; - } - //! 
deal with remain wout - if (w_loop & 1) { - ptr_out0[0] += - r0[0] * w_tmp[0] + r0[1] * w_tmp[1] + r0[2] * w_tmp[2] + - r1[0] * w_tmp[3] + r1[1] * w_tmp[4] + r1[2] * w_tmp[5] + - r2[0] * w_tmp[6] + r2[1] * w_tmp[7] + r2[2] * w_tmp[8]; - - ptr_out0[1] += - r0[1] * w_tmp[0] + r0[2] * w_tmp[1] + r0[3] * w_tmp[2] + - r1[1] * w_tmp[3] + r1[2] * w_tmp[4] + r1[3] * w_tmp[5] + - r2[1] * w_tmp[6] + r2[2] * w_tmp[7] + r2[3] * w_tmp[8]; - - ptr_out0[2] += - r0[2] * w_tmp[0] + r0[3] * w_tmp[1] + r0[4] * w_tmp[2] + - r1[2] * w_tmp[3] + r1[3] * w_tmp[4] + r1[4] * w_tmp[5] + - r2[2] * w_tmp[6] + r2[3] * w_tmp[7] + r2[4] * w_tmp[8]; - - ptr_out0[3] += - r0[3] * w_tmp[0] + r0[4] * w_tmp[1] + r0[5] * w_tmp[2] + - r1[3] * w_tmp[3] + r1[4] * w_tmp[4] + r1[5] * w_tmp[5] + - r2[3] * w_tmp[6] + r2[4] * w_tmp[7] + r2[5] * w_tmp[8]; - - ptr_out1[0] += - r1[0] * w_tmp[0] + r1[1] * w_tmp[1] + r1[2] * w_tmp[2] + - r2[0] * w_tmp[3] + r2[1] * w_tmp[4] + r2[2] * w_tmp[5] + - r3[0] * w_tmp[6] + r3[1] * w_tmp[7] + r3[2] * w_tmp[8]; - - ptr_out1[1] += - r1[1] * w_tmp[0] + r1[2] * w_tmp[1] + r1[3] * w_tmp[2] + - r2[1] * w_tmp[3] + r2[2] * w_tmp[4] + r2[3] * w_tmp[5] + - r3[1] * w_tmp[6] + r3[2] * w_tmp[7] + r3[3] * w_tmp[8]; - - ptr_out1[2] += - r1[2] * w_tmp[0] + r1[3] * w_tmp[1] + r1[4] * w_tmp[2] + - r2[2] * w_tmp[3] + r2[3] * w_tmp[4] + r2[4] * w_tmp[5] + - r3[2] * w_tmp[6] + r3[3] * w_tmp[7] + r3[4] * w_tmp[8]; - - ptr_out1[3] += - r1[3] * w_tmp[0] + r1[4] * w_tmp[1] + r1[5] * w_tmp[2] + - r2[3] * w_tmp[3] + r2[4] * w_tmp[4] + r2[5] * w_tmp[5] + - r3[3] * w_tmp[6] + r3[4] * w_tmp[7] + r3[5] * w_tmp[8]; - } - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr2; - block_inr1 = block_inr3; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - } - slidingwindow_writeout_c1_fp32(pre_out, dout_batch, c_idx, c_idx + 1, h, - h + h_kernel, 0, wout_round, chout, hout, - wout, relu, 
ptr_write); - } - } - } -} - -template <> -void SlidingwindowConv3x3s2Faster( - const framework::Tensor *input, framework::Tensor *filter, - const std::vector &paddings, framework::Tensor *output, - const float *bias, bool is_bias, bool is_relu) { - const float *din = input->data(); - float *dout = output->mutable_data(); - const float *weights = filter->mutable_data(); - if (!is_bias) { - bias = nullptr; - } - bool relu = is_relu; - const int num = input->dims()[0]; - const int chin = input->dims()[1]; - const int hin = input->dims()[2]; - const int win = input->dims()[3]; - const int chout = output->dims()[1]; - const int hout = output->dims()[2]; - const int wout = output->dims()[3]; - const int pad_h = paddings[0]; - const int pad_w = paddings[1]; - const int threads = framework::CPUContext::Context()->get_thread_num(); - int l2_size = - framework::CPUContext::Context()->get_l2_cache_size() / sizeof(float); - const int hout_c_block = 4; - const int hout_r_kernel = 2; - const int wout_block = 4; - const int wout_round = ((wout + wout_block - 1) / wout_block) * wout_block; - const int win_round = wout_round * 2 /*stride_w*/ + 1; - //! get h block - //! win_round * chin * hin_r_block + wout_round * hout_c_block * hout_r_block - //! * threads = l2_size win_round = 2 * wout_round + 1 hin_r_block = 2 * - //! hout_r_block + 1 - int hout_r_block = - (l2_size - 2 * wout_round * chin - chin) / - ((4 * wout_round + 2) * chin + wout_round * hout_c_block * threads); - hout_r_block = hout_r_block > hout ? hout : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? 
hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block * 2 /*stride_h*/ + 1; - - float ptr_zero[win_round]; - memset(ptr_zero, 0, sizeof(float) * win_round); - float ptr_write[wout_round]; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - float *pre_din = - static_cast(framework::CPUContext::Context()->get_work_space( - (pre_in_size + threads * pre_out_size) * sizeof(float))); - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; /*kernel_w * kernel_h*/ - int w_stride_chin = hout_c_block * 9; // kernel_w * kernel_h * - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int c_remain = chout - (chout / hout_c_block) * hout_c_block; - int c_round_down = (chout / hout_c_block) * hout_c_block; - - int out_row_stride = hout_c_block * wout_round; - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * chin * size_in_channel; - float *dout_batch = dout + n * chout * size_out_channel; - for (int h = 0; h < hout; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > hout) { - h_kernel = hout - h; - } - - int hs = h * 2 /*stride_h*/ - pad_h; - int he = hs + h_kernel * 2 /*stride_h*/ + 1; - - slidingwindow_prepack_input(din_batch, pre_din, 0, chin, hs, he, ws, we, - chin, win, hin, ptr_zero); - - const float *cblock_inr0 = pre_din; - const float *cblock_inr1 = cblock_inr0 + in_len; - const float *cblock_inr2 = cblock_inr1 + in_len; - const float *cblock_inr3 = cblock_inr2 + in_len; - const float *cblock_inr4 = cblock_inr3 + in_len; - -#pragma omp parallel for - for (int c = 0; c < c_round_down; c += hout_c_block) { -#ifdef _OPENMP - float *pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float *pre_out = pre_din + pre_in_size; -#endif - const float *block_inr0 = cblock_inr0; - const float *block_inr1 = cblock_inr1; - 
const float *block_inr2 = cblock_inr2; - const float *block_inr3 = cblock_inr3; - const float *block_inr4 = cblock_inr4; - - const float *weight_c = weights + c * w_stride; - const float *bias_ptr = ptr_zero; - if (bias != nullptr) { - bias_ptr = bias + c; - } - slidingwindow_fill_bias(pre_out, bias_ptr, - wout_round * hout_c_block * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float *wc0 = weight_c; - - const float *inr0 = block_inr0; - const float *inr1 = block_inr1; - const float *inr2 = block_inr2; - const float *inr3 = block_inr3; - const float *inr4 = block_inr4; - - float *pre_out0 = pre_out + hk * out_row_stride; - float *pre_out1 = pre_out0 + out_row_stride; -#ifdef __aarch64__ - for (int i = 0; i < chin; ++i) { - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - float32x4_t w0 = vld1q_f32(wc0); // w0, v23 - float32x4_t w1 = vld1q_f32(wc0 + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(wc0 + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(wc0 + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(wc0 + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(wc0 + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(wc0 + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(wc0 + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(wc0 + 32); // w8, v31 - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - const float *r4 = inr4; - - int cnt = w_loop; - asm volatile( - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - - "ldp q0, q1, [%[r0]], #32 \n" /* load input r0*/ - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - "ldp q4, q5, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "2: \n" /* main loop*/ - /* r0, r2, mul w0, get out r0, r1 */ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr12, outr13*/ - "fmla v15.4s , 
%[w0].4s, v0.s[0]\n" /* outr00 = w0 * r0[0]*/ - "fmla v16.4s , %[w0].4s, v0.s[2]\n" /* outr01 = w0 * r0[2]*/ - "fmla v17.4s , %[w0].4s, v1.s[0]\n" /* outr02 = w0 * r0[4]*/ - "fmla v18.4s , %[w0].4s, v1.s[2]\n" /* outr03 = w0 * r0[6]*/ - "fmla v19.4s , %[w0].4s, v4.s[0]\n" /* outr10 = w0 * r2[0]*/ - "fmla v20.4s , %[w0].4s, v4.s[2]\n" /* outr11 = w0 * r2[2]*/ - "fmla v21.4s , %[w0].4s, v5.s[0]\n" /* outr12 = w0 * r2[4]*/ - "fmla v22.4s , %[w0].4s, v5.s[2]\n" /* outr13 = w0 * r2[6]*/ - - "ldp q2, q3, [%[r1]], #32 \n" /* load input r1*/ - - /* r2 mul w6, get out r0*/ - "fmla v15.4s , %[w6].4s, v4.s[0]\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v4.s[2]\n" /* outr01 = w6 * r2[2]*/ - "fmla v17.4s , %[w6].4s, v5.s[0]\n" /* outr02 = w6 * r2[4]*/ - "fmla v18.4s , %[w6].4s, v5.s[2]\n" /* outr03 = w6 * r2[6]*/ - - "ldr d11, [%[r1]] \n" /* load input r1, 9th - element*/ - - /* r0, r2, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v0.s[1]\n" /* outr00 = w1 * r0[1]*/ - "fmla v16.4s , %[w1].4s, v0.s[3]\n" /* outr01 = w1 * r0[3]*/ - "fmla v17.4s , %[w1].4s, v1.s[1]\n" /* outr02 = w1 * r0[5]*/ - "fmla v18.4s , %[w1].4s, v1.s[3]\n" /* outr03 = w1 * r0[7]*/ - "fmla v19.4s , %[w1].4s, v4.s[1]\n" /* outr10 = w1 * r2[1]*/ - "fmla v20.4s , %[w1].4s, v4.s[3]\n" /* outr11 = w1 * r2[3]*/ - "fmla v21.4s , %[w1].4s, v5.s[1]\n" /* outr12 = w1 * r2[5]*/ - "fmla v22.4s , %[w1].4s, v5.s[3]\n" /* outr13 = w1 * r2[7]*/ - - "ldp q6, q7, [%[r3]], #32 \n" /* load input r3*/ - - /* r2 mul w7, get out r0 */ - "fmla v15.4s , %[w7].4s, v4.s[1]\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v4.s[3]\n" /* outr01 = w7 * r2[3]*/ - "fmla v17.4s , %[w7].4s, v5.s[1]\n" /* outr02 = w7 * r2[5]*/ - "fmla v18.4s , %[w7].4s, v5.s[3]\n" /* outr03 = w7 * r2[7]*/ - - "ldr d13, [%[r3]] \n" /* load input r3, 9th - element*/ - - /* r0, r2, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v0.s[2]\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v1.s[0]\n" /* outr01 = w2 * r0[4]*/ - "fmla 
v17.4s , %[w2].4s, v1.s[2]\n" /* outr02 = w2 * r0[6]*/ - "fmla v18.4s , %[w2].4s, v10.s[0]\n" /* outr03 = w2 * - r0[8]*/ - "fmla v19.4s , %[w2].4s, v4.s[2]\n" /* outr10 = w2 * r2[2]*/ - "fmla v20.4s , %[w2].4s, v5.s[0]\n" /* outr11 = w2 * r2[4]*/ - "fmla v21.4s , %[w2].4s, v5.s[2]\n" /* outr12 = w2 * r2[6]*/ - "fmla v22.4s , %[w2].4s, v12.s[0]\n" /* outr13 = w2 * - r2[8]*/ - - "ldp q8, q9, [%[r4]], #32 \n" /* load input r4*/ - - /* r2, mul w8, get out r0 */ - "fmla v15.4s , %[w8].4s, v4.s[2]\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v5.s[0]\n" /* outr01 = w8 * r2[4]*/ - "fmla v17.4s , %[w8].4s, v5.s[2]\n" /* outr02 = w8 * r2[6]*/ - "fmla v18.4s , %[w8].4s, v12.s[0]\n" /* outr03 = w8 * - r2[8]*/ - - "ldr d14, [%[r4]] \n" /* load input r4, 9th - element*/ - - /* r1, r3, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v2.s[0]\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v2.s[2]\n" /* outr01 = w3 * r1[2]*/ - "fmla v17.4s , %[w3].4s, v3.s[0]\n" /* outr02 = w3 * r1[4]*/ - "fmla v18.4s , %[w3].4s, v3.s[2]\n" /* outr03 = w3 * r1[6]*/ - "fmla v19.4s , %[w3].4s, v6.s[0]\n" /* outr10 = w3 * r3[0]*/ - "fmla v20.4s , %[w3].4s, v6.s[2]\n" /* outr11 = w3 * r3[2]*/ - "fmla v21.4s , %[w3].4s, v7.s[0]\n" /* outr12 = w3 * r3[4]*/ - "fmla v22.4s , %[w3].4s, v7.s[2]\n" /* outr13 = w3 * r3[6]*/ - - "ldp q0, q1, [%[r0]], #32 \n" /* load input r0*/ - - /* r1, r3, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v2.s[1]\n" /* outr00 = w4 * r1[1]*/ - "fmla v16.4s , %[w4].4s, v2.s[3]\n" /* outr01 = w4 * r1[3]*/ - "fmla v17.4s , %[w4].4s, v3.s[1]\n" /* outr02 = w4 * r1[5]*/ - "fmla v18.4s , %[w4].4s, v3.s[3]\n" /* outr03 = w4 * r1[7]*/ - "fmla v19.4s , %[w4].4s, v6.s[1]\n" /* outr10 = w4 * r3[1]*/ - "fmla v20.4s , %[w4].4s, v6.s[3]\n" /* outr11 = w4 * r3[3]*/ - "fmla v21.4s , %[w4].4s, v7.s[1]\n" /* outr12 = w4 * r3[5]*/ - "fmla v22.4s , %[w4].4s, v7.s[3]\n" /* outr13 = w4 * r3[7]*/ - - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - - /* r1, r3, mul 
w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v2.s[2]\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v3.s[0]\n" /* outr01 = w5 * r1[4]*/ - "fmla v17.4s , %[w5].4s, v3.s[2]\n" /* outr02 = w5 * r1[6]*/ - "fmla v18.4s , %[w5].4s, v11.s[0]\n" /* outr03 = w5 * - r1[8]*/ - - "ldp q4, q5, [%[r2]], #32 \n" /* load input r2*/ - "stp q15, q16, [%[ptr_out0]], #32\n" /* save outr00, outr01*/ - - "fmla v19.4s , %[w5].4s, v6.s[2]\n" /* outr10 = w5 * r3[2]*/ - "fmla v20.4s , %[w5].4s, v7.s[0]\n" /* outr11 = w5 * r3[4]*/ - "fmla v21.4s , %[w5].4s, v7.s[2]\n" /* outr12 = w5 * r3[6]*/ - "fmla v22.4s , %[w5].4s, v13.s[0]\n" /* outr13 = w5 * - r3[8]*/ - - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "stp q17, q18, [%[ptr_out0]], #32\n" /* save outr02, outr03*/ - - /* r4, mul w6, get out r1 */ - "fmla v19.4s , %[w6].4s, v8.s[0]\n" /* outr10 = w6 * r4[0]*/ - "fmla v20.4s , %[w6].4s, v8.s[2]\n" /* outr11 = w6 * r4[2]*/ - "fmla v21.4s , %[w6].4s, v9.s[0]\n" /* outr12 = w6 * r4[4]*/ - "fmla v22.4s , %[w6].4s, v9.s[2]\n" /* outr13 = w6 * r4[6]*/ - - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - - /* r4, mul w7, get out r1 */ - "fmla v19.4s , %[w7].4s, v8.s[1]\n" /* outr10 = w7 * r4[1]*/ - "fmla v20.4s , %[w7].4s, v8.s[3]\n" /* outr11 = w7 * r4[3]*/ - "fmla v21.4s , %[w7].4s, v9.s[1]\n" /* outr12 = w7 * r4[5]*/ - "fmla v22.4s , %[w7].4s, v9.s[3]\n" /* outr13 = w7 * r4[7]*/ - - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - - /* r4, mul w8, get out r1 */ - "fmla v19.4s , %[w8].4s, v8.s[2]\n" /* outr10 = w8 * r4[2]*/ - "fmla v20.4s , %[w8].4s, v9.s[0]\n" /* outr11 = w8 * r4[4]*/ - "fmla v21.4s , %[w8].4s, v9.s[2]\n" /* outr12 = w8 * r4[6]*/ - "fmla v22.4s , %[w8].4s, v14.s[0]\n" /* outr13 = w8 * - r4[8]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - "stp q19, q20, [%[ptr_out1]], #32\n" /* save outr10, outr11*/ - "stp q21, q22, [%[ptr_out1]], #32\n" /* save outr12, outr13*/ - - "bne 2b \n" /* jump to main loop*/ - - : [cnt] 
"+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [r3] "+r"(r3), [r4] "+r"(r4), [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), [w3] "w"(w3), - [w4] "w"(w4), [w5] "w"(w5), [w6] "w"(w6), [w7] "w"(w7), - [w8] "w"(w8) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v17", "v18", "v19", "v20", "v21", "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < chin; ++i) { - const float *wc0 = weight_c + i * w_stride_chin; - - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - const float *r4 = inr4; - - int cnt = w_loop; - asm volatile( - "vld1.32 {d16-d19}, [%[ptr_out0]]! @ load " - "outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - /* load r0, r2 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, " - "8 float\n" - "vld1.32 {d8}, [%[r0]] @ load r0, " - "9th float\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - /* main loop */ - "0: @ main " - "loop\n" - /* mul r0, with w0, w1, w2 */ - "vld1.32 {d24-d27}, [%[ptr_out1]]! @ load " - "outr1, w0, w1, c0~c3\n" - "vmla.f32 q8, q5, d0[0] @ w0 * " - "inr00\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load " - "outr1, w2, w3, c0~c3\n" - "vmla.f32 q9, q5, d1[0] @ w0 * " - "inr02\n" - "vmla.f32 q10, q5, d2[0] @ w0 * " - "inr04\n" - "vmla.f32 q11, q5, d3[0] @ w0 * " - "inr06\n" - "vld1.32 {d4-d7}, [%[r2]]! 
@ load r2, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w1 * " - "inr01\n" - "vmla.f32 q9, q6, d1[1] @ w1 * " - "inr03\n" - "vmla.f32 q10, q6, d2[1] @ w1 * " - "inr05\n" - "vmla.f32 q11, q6, d3[1] @ w1 * " - "inr07\n" - "vld1.32 {d9}, [%[r2]] @ load r2, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w2 * " - "inr02\n" - "vmla.f32 q9, q7, d2[0] @ w2 * " - "inr04\n" - "vmla.f32 q10, q7, d3[0] @ w2 * " - "inr06\n" - "vmla.f32 q11, q7, d8[0] @ w2 * " - "inr08\n" - - "sub %[r2], %[r2], #32 @ r2 - 32, " - "load r2 twice\n" - - /* mul r2, with w0, w1, w2 */ - "vld1.32 {d0-d3}, [%[r1]]! @ load r1, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w0 * " - "inr20\n" - "vmla.f32 q13, q5, d5[0] @ w0 * " - "inr22\n" - "vmla.f32 q14, q5, d6[0] @ w0 * " - "inr24\n" - "vmla.f32 q15, q5, d7[0] @ w0 * " - "inr26\n" - "vld1.32 {d8}, [%[r1]] @ load r1, " - "9th float\n" - "vmla.f32 q12, q6, d4[1] @ w1 * " - "inr21\n" - "vmla.f32 q13, q6, d5[1] @ w1 * " - "inr23\n" - "vmla.f32 q14, q6, d6[1] @ w1 * " - "inr25\n" - "vmla.f32 q15, q6, d7[1] @ w1 * " - "inr27\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w3, " - "w4, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w2 * " - "inr22\n" - "vmla.f32 q13, q7, d6[0] @ w2 * " - "inr24\n" - "vmla.f32 q14, q7, d7[0] @ w2 * " - "inr26\n" - "vmla.f32 q15, q7, d9[0] @ w2 * " - "inr28\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w5, " - "to q7\n" - - /* mul r1, with w3, w4, w5 */ - "vmla.f32 q8, q5, d0[0] @ w3 * " - "inr10\n" - "vmla.f32 q9, q5, d1[0] @ w3 * " - "inr12\n" - "vmla.f32 q10, q5, d2[0] @ w3 * " - "inr14\n" - "vmla.f32 q11, q5, d3[0] @ w3 * " - "inr16\n" - "vld1.32 {d4-d7}, [%[r3]]! 
@ load r3, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w4 * " - "inr11\n" - "vmla.f32 q9, q6, d1[1] @ w4 * " - "inr13\n" - "vmla.f32 q10, q6, d2[1] @ w4 * " - "inr15\n" - "vmla.f32 q11, q6, d3[1] @ w4 * " - "inr17\n" - "vld1.32 {d9}, [%[r3]] @ load r3, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w5 * " - "inr12\n" - "vmla.f32 q9, q7, d2[0] @ w5 * " - "inr14\n" - "vmla.f32 q10, q7, d3[0] @ w5 * " - "inr16\n" - "vmla.f32 q11, q7, d8[0] @ w5 * " - "inr18\n" - - "sub %[ptr_out1], %[ptr_out1], #32 @ ptr_out1 " - "- 32, to start address\n" - - /* mul r3, with w3, w4, w5 */ - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w3 * " - "inr30\n" - "vmla.f32 q13, q5, d5[0] @ w3 * " - "inr32\n" - "vmla.f32 q14, q5, d6[0] @ w3 * " - "inr34\n" - "vmla.f32 q15, q5, d7[0] @ w3 * " - "inr36\n" - "vld1.32 {d8}, [%[r2]] @ load r2, " - "9th float\n" - "vmla.f32 q12, q6, d4[1] @ w4 * " - "inr31\n" - "vmla.f32 q13, q6, d5[1] @ w4 * " - "inr33\n" - "vmla.f32 q14, q6, d6[1] @ w4 * " - "inr35\n" - "vmla.f32 q15, q6, d7[1] @ w4 * " - "inr37\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w6, " - "w7, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w5 * " - "inr32\n" - "vmla.f32 q13, q7, d6[0] @ w5 * " - "inr34\n" - "vmla.f32 q14, q7, d7[0] @ w5 * " - "inr36\n" - "vmla.f32 q15, q7, d9[0] @ w5 * " - "inr38\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w8, " - "to q7\n" - - /* mul r2, with w6, w7, w8 */ - "vmla.f32 q8, q5, d0[0] @ w6 * " - "inr20\n" - "vmla.f32 q9, q5, d1[0] @ w6 * " - "inr22\n" - "vmla.f32 q10, q5, d2[0] @ w6 * " - "inr24\n" - "vmla.f32 q11, q5, d3[0] @ w6 * " - "inr26\n" - "vld1.32 {d4-d7}, [%[r4]]! 
@ load r4, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w7 * " - "inr21\n" - "vmla.f32 q9, q6, d1[1] @ w7 * " - "inr23\n" - "vmla.f32 q10, q6, d2[1] @ w7 * " - "inr25\n" - "vmla.f32 q11, q6, d3[1] @ w7 * " - "inr27\n" - "vld1.32 {d9}, [%[r4]] @ load r4, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w8 * " - "inr22\n" - "vmla.f32 q9, q7, d2[0] @ w8 * " - "inr24\n" - "vmla.f32 q10, q7, d3[0] @ w8 * " - "inr26\n" - "vmla.f32 q11, q7, d8[0] @ w8 * " - "inr28\n" - - "sub %[wc0], %[wc0], #144 @ wc0 - " - "144 to start address\n" - - /* mul r4, with w6, w7, w8 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w3 * " - "inr40\n" - "vst1.32 {d16-d19}, [%[ptr_out0]]! @ save " - "r00, r01, c0~c3\n" - "vmla.f32 q13, q5, d5[0] @ w3 * " - "inr42\n" - "vst1.32 {d20-d23}, [%[ptr_out0]]! @ save " - "r02, r03, c0~c3\n" - "vmla.f32 q14, q5, d6[0] @ w3 * " - "inr44\n" - "vmla.f32 q15, q5, d7[0] @ w3 * " - "inr46\n" - "vld1.32 {d8}, [%[r0]] @ load r0, " - "9th float\n" - "vmla.f32 q12, q6, d4[1] @ w4 * " - "inr41\n" - "vmla.f32 q13, q6, d5[1] @ w4 * " - "inr43\n" - "vmla.f32 q14, q6, d6[1] @ w4 * " - "inr45\n" - "vmla.f32 q15, q6, d7[1] @ w4 * " - "inr47\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w5 * " - "inr42\n" - "vmla.f32 q13, q7, d6[0] @ w5 * " - "inr44\n" - "vmla.f32 q14, q7, d7[0] @ w5 * " - "inr46\n" - "vmla.f32 q15, q7, d9[0] @ w5 * " - "inr48\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - "vst1.32 {d24-d27}, [%[ptr_out1]]! @ save " - "r10, r11, c0~c3\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save " - "r12, r13, c0~c3\n" - - "vld1.32 {d16-d19}, [%[ptr_out0]]! 
@ load " - "outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - "subs %[cnt], #1 @ loop " - "count--\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [r3] "+r"(r3), [r4] "+r"(r4), [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), [wc0] "+r"(wc0) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - - slidingwindow_writeout_c4_fp32(pre_out, dout_batch, c, c + hout_c_block, - h, h + h_kernel, 0, wout_round, chout, - hout, wout, relu, ptr_write); - } - -#pragma omp parallel for - for (int c = 0; c < c_remain; ++c) { -#ifdef USE_OPENMP - float *pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float *pre_out = pre_din + pre_in_size; -#endif - - const float *block_inr0 = cblock_inr0; - const float *block_inr1 = cblock_inr1; - const float *block_inr2 = cblock_inr2; - const float *block_inr3 = cblock_inr3; - const float *block_inr4 = cblock_inr4; - - //! get weights ptr of remained - const float *weight_c = weights + c_round_down * w_stride; - - //! 
fill bias to one channel - const float *bias_ptr = ptr_zero; - if (bias != nullptr) { - bias_ptr = bias + c_round_down + c; - } - slidingwindow_fill_bias(pre_out, bias_ptr, 1, wout_round * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float *wc0 = weight_c; - - const float *inr0 = block_inr0; - const float *inr1 = block_inr1; - const float *inr2 = block_inr2; - const float *inr3 = block_inr3; - const float *inr4 = block_inr4; - - float *pre_out0 = pre_out + hk * wout_round; - float *pre_out1 = pre_out0 + wout_round; -#ifdef __aarch64__ - for (int i = 0; i < chin; ++i) { - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - //! get valid weights of current output channel - float32x4_t w0 = vdupq_n_f32(wc0[c]); // w0, v23 - float32x4_t w1 = vdupq_n_f32(wc0[c + 4]); // w1, v24 - float32x4_t w2 = vdupq_n_f32(wc0[c + 8]); // w2, v25 - float32x4_t w3 = vdupq_n_f32(wc0[c + 12]); // w3, v26 - float32x4_t w4 = vdupq_n_f32(wc0[c + 16]); // w4, v27 - float32x4_t w5 = vdupq_n_f32(wc0[c + 20]); // w5, v28 - float32x4_t w6 = vdupq_n_f32(wc0[c + 24]); // w6, v29 - float32x4_t w7 = vdupq_n_f32(wc0[c + 28]); // w7, v30 - float32x4_t w8 = vdupq_n_f32(wc0[c + 32]); // w8, v31 - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - const float *r4 = inr4; - - int cnt = w_loop; - asm volatile( - "ldr q21, [%[ptr_out0]] \n" /* load outr00, outr01, - outr02, outr03*/ - - "ld2 {v0.4s, v1.4s}, [%[r0]], #32 \n" /* load input r0*/ - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - "ld2 {v4.4s, v5.4s}, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "2: \n" /* main loop*/ - /* r0, r2, mul w0, get out r0, r1 */ - "ldr q22, [%[ptr_out1]] \n" /* load outr10, outr11, - outr12, outr13*/ - - "fmla v21.4s , %[w0].4s, v0.4s \n" /* outr0 = w0 * r0[0, 2, - 4, 6]*/ - "fmla v22.4s , %[w0].4s, v4.4s \n" /* outr1 = w0 * r2[0, 2, - 4, 6]*/ - - "ld2 {v2.4s, 
v3.4s}, [%[r1]], #32 \n" /* load input r1*/ - - /* r2 mul w6, get out r0*/ - "fmla v21.4s , %[w6].4s, v4.4s \n" /* outr0 = w6 * r2[0, 2, - 4, 6]*/ - "ldr d11, [%[r1]] \n" /* load input r1, 9th - element*/ - - /* shift left 1 */ - "ext v15.16b, v0.16b, v10.16b, #4\n" /* shift left r0 1*/ - "ext v16.16b, v4.16b, v12.16b, #4\n" /* shift left r2 1*/ - - /* r0, r2, mul w1, get out r0, r1 */ - "fmla v21.4s , %[w1].4s, v1.4s \n" /* outr0 = w1 * r0[1, 3, - 5, 7]*/ - "fmla v22.4s , %[w1].4s, v5.4s \n" /* outr1 = w1 * r2[1, 3, - 5, 7]*/ - - "ld2 {v6.4s, v7.4s}, [%[r3]], #32 \n" /* load input r3*/ - - /* r2 mul w7, get out r0 */ - "fmla v21.4s , %[w7].4s, v5.4s \n" /* outr00 = w7 * r2[1, - 3, 5, 7]*/ - - "ldr d13, [%[r3]] \n" /* load input r3, 9th - element*/ - - /* r0, r2, mul w2, get out r0, r1 */ - "fmla v21.4s , %[w2].4s, v15.4s \n" /* outr0 = w2 * r0[2, 4, - 6, 8]*/ - "fmla v22.4s , %[w2].4s, v16.4s \n" /* outr1 = w2 * r2[2, 4, - 6, 8]*/ - - "ld2 {v8.4s, v9.4s}, [%[r4]], #32 \n" /* load input r4*/ - - /* r2, mul w8, get out r0 */ - "fmla v21.4s , %[w8].4s, v16.4s \n" /* outr00 = w8 * r2[2, - 4, 6, 8]*/ - - "ldr d14, [%[r4]] \n" /* load input r4, 9th - element*/ - - /* r1, r3, mul w3, get out r0, r1 */ - "fmla v21.4s , %[w3].4s, v2.4s \n" /* outr0 = w3 * r1[0, 2, - 4, 6]*/ - "fmla v22.4s , %[w3].4s, v6.4s \n" /* outr1 = w3 * r3[0, 2, - 4, 6]*/ - - /* shift left 1 */ - "ext v15.16b, v2.16b, v11.16b, #4\n" /* shift left r1 1*/ - "ext v16.16b, v6.16b, v13.16b, #4\n" /* shift left r3 1*/ - - "ld2 {v0.4s, v1.4s}, [%[r0]], #32 \n" /* load input r0*/ - - /* r1, r3, mul w4, get out r0, r1 */ - "fmla v21.4s , %[w4].4s, v3.4s \n" /* outr0 = w4 * r1[1, 3, - 5, 7]*/ - "fmla v22.4s , %[w4].4s, v7.4s \n" /* outr1 = w4 * r3[1, 3, - 5, 7]*/ - - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - - /* r1, r3, mul w5, get out r0, r1 */ - "fmla v21.4s , %[w5].4s, v15.4s \n" /* outr0 = w5 * r1[2]*/ - "fmla v22.4s , %[w5].4s, v16.4s \n" /* outr1 = w5 * r1[4]*/ - - "ld2 {v4.4s, v5.4s}, 
[%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "str q21, [%[ptr_out0]], #16 \n" /* save outr00, outr01*/ - - /* r4, mul w6, get out r1 */ - "fmla v22.4s , %[w6].4s, v8.4s \n" /* outr1 = w6 * r4[0, 2, - 4, 6]*/ - - "ext v15.16b, v8.16b, v14.16b, #4\n" /* shift left r1 1*/ - "ldr q21, [%[ptr_out0]] \n" /* load outr0*/ - - /* r4, mul w7, get out r1 */ - "fmla v22.4s , %[w7].4s, v9.4s \n" /* outr1 = w7 * r4[1, 3, - 5, 7]*/ - - /* r4, mul w8, get out r1 */ - "fmla v22.4s , %[w8].4s, v15.4s \n" /* outr1 = w8 * r4[2, 4, - 6, 8]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - "str q22, [%[ptr_out1]], #16 \n" /* save outr1*/ - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [r3] "+r"(r3), [r4] "+r"(r4), [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), [w3] "w"(w3), - [w4] "w"(w4), [w5] "w"(w5), [w6] "w"(w6), [w7] "w"(w7), - [w8] "w"(w8) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v21", "v22"); - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < chin; ++i) { - float *ptr_out0 = pre_out0; - float *ptr_out1 = pre_out1; - - //! 
get valid weights of current output channel - float w_tmp[12] = {wc0[c], wc0[c + 4], wc0[c + 8], 0.f, - wc0[c + 12], wc0[c + 16], wc0[c + 20], 0.f, - wc0[c + 24], wc0[c + 28], wc0[c + 32], 0.f}; - float32x4_t w0 = vld1q_f32(w_tmp); // w0, w1, w2, q0 - float32x4_t w1 = vld1q_f32(w_tmp + 4); // w3, w4, w5, q1 - float32x4_t w2 = vld1q_f32(w_tmp + 8); // w6, w7, w8, q2 - - const float *r0 = inr0; - const float *r1 = inr1; - const float *r2 = inr2; - const float *r3 = inr3; - const float *r4 = inr4; - - int cnt = w_loop / 2; - if (cnt > 0) { - asm volatile( - /* main loop */ - "0: @ " - "main loop\n" - "vld1.32 {d24-d27}, [%[ptr_out0]] @ load or00, " - "or01\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load or10, " - "or11\n" - "vld2.32 {d6-d9}, [%[r2]]! @ load r2, 8 " - "float, interleave\n" - "vld2.32 {d10-d13}, [%[r2]]! @ load r2, 8 " - "float, interleave\n" - "vld1.32 {d22}, [%[r2]] @ load 16th " - "float\n" - - /* r2 * w2, r2 * w0, get or0, or1 */ - "vmla.f32 q12, q4, %e[w2][1] @ w21 * r2, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q6, %e[w2][1] @ w21 * r2, " - "9, 11, 13, 15\n" - "vld2.32 {d14-d17}, [%[r0]]! @ load r0, 8 " - "float, interleave\n" - "vmla.f32 q14, q4, %e[w0][1] @ w01 * r2, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w0][1] @ w01 * r2, " - "9, 11, 13, 15\n" - - "vext.32 q4, q3, q5, #1 @ r2, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r2, shift " - "left 1, get 10, 12, 14, 16\n" - - "vmla.f32 q12, q3, %e[w2][0] @ w20 * r2, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q5, %e[w2][0] @ w20 * r2, " - "8, 10, 12, 14\n" - "vld2.32 {d18-d21}, [%[r0]]! @ load r0, 8 " - "float, interleave\n" - "vmla.f32 q14, q3, %e[w0][0] @ w00 * r2, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w0][0] @ w00 * r2, " - "8, 10, 12, 14\n" - - "vld1.32 {d22}, [%[r0]] @ load 16th " - "float\n" - - "vmla.f32 q12, q4, %f[w2][0] @ w22 * r2, " - "2, 4, 6, 8\n" - "vmla.f32 q14, q4, %f[w0][0] @ w02 * r2, " - "2, 4, 6, 8\n" - "vld2.32 {d6-d9}, [%[r3]]! 
@ load r3, 8 " - "float, interleave\n" - "vmla.f32 q13, q6, %f[w2][0] @ w22 * r2, " - "10, 12, 14, 16\n" - "vmla.f32 q15, q6, %f[w0][0] @ w02 * r2, " - "10, 12, 14, 16\n" - "vld2.32 {d10-d13}, [%[r3]]! @ load r3, 8 " - "float, interleave\n" - - /* r0 * w0, get or0, r3 * w1, get or1*/ - "vmla.f32 q12, q8, %e[w0][1] @ w01 * r0, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q10, %e[w0][1] @ w01 * r0, " - "9, 11, 13, 15\n" - "vext.32 q8, q7, q9, #1 @ r0, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q10, q9, q11, #1 @ r0, shift " - "left 1, get 10, 12, 14, 16\n" - "vld1.32 {d22}, [%[r3]] @ load 16th " - "float\n" - "vmla.f32 q14, q4, %e[w1][1] @ w11 * r3, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w1][1] @ w11 * r3, " - "9, 11, 13, 15\n" - - "vmla.f32 q12, q7, %e[w0][0] @ w00 * r0, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q9, %e[w0][0] @ w00 * r0, " - "8, 10, 12, 14\n" - "vext.32 q4, q3, q5, #1 @ r3, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r3, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q3, %e[w1][0] @ w10 * r3, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w1][0] @ w10 * r3, " - "8, 10, 12, 14\n" - - "vmla.f32 q12, q8, %f[w0][0] @ w02 * r0, " - "2, 4, 6, 8\n" - "vld2.32 {d14-d17}, [%[r1]]! @ load r1, 8 " - "float, interleave\n" - "vmla.f32 q13, q10,%f[w0][0] @ w02 * r0, " - "10, 12, 14, 16\n" - "vld2.32 {d18-d21}, [%[r1]]! @ load r1, 8 " - "float, interleave\n" - "vmla.f32 q14, q4, %f[w1][0] @ w12 * r3, " - "2, 4, 6, 8\n" - "vld2.32 {d6-d9}, [%[r4]]! @ load r4, 8 " - "float, interleave\n" - "vmla.f32 q15, q6, %f[w1][0] @ w12 * r3, " - "10, 12, 14, 16\n" - "vld2.32 {d10-d13}, [%[r4]]! 
@ load r4, 8 " - "float, interleave\n" - - "vld1.32 {d22}, [%[r1]] @ load 16th " - "float\n" - - /* r1 * w1, get or0, r4 * w2, get or1 */ - "vmla.f32 q12, q8, %e[w1][1] @ w11 * r1, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q10, %e[w1][1] @ w11 * r1, " - "9, 11, 13, 15\n" - "vext.32 q8, q7, q9, #1 @ r1, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q10, q9, q11, #1 @ r1, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q4, %e[w2][1] @ w21 * r4, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w2][1] @ w21 * r4, " - "9, 11, 13, 15\n" - "vld1.32 {d22}, [%[r4]] @ load 16th " - "float\n" - - "vmla.f32 q12, q7, %e[w1][0] @ w10 * r1, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q9, %e[w1][0] @ w10 * r1, " - "8, 10, 12, 14\n" - "vext.32 q4, q3, q5, #1 @ r1, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r1, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q3, %e[w2][0] @ w20 * r4, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w2][0] @ w20 * r4, " - "8, 10, 12, 14\n" - - "vmla.f32 q12, q8, %f[w1][0] @ w12 * r1, " - "2, 4, 6, 8\n" - "vmla.f32 q13, q10, %f[w1][0] @ w12 * r1, " - "10, 12, 14, 16\n" - "vmla.f32 q14, q4, %f[w2][0] @ w22 * r4, " - "2, 4, 6, 8\n" - "vmla.f32 q15, q6, %f[w2][0] @ w22 * r4, " - "10, 12, 14, 16\n" - - "vst1.32 {d24-d27}, [%[ptr_out0]]! @ save or0\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save or0\n" - - "subs %[cnt], #1 @loop count " - "-1\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), [r0] "+r"(r0), [r1] "+r"(r1), - [r2] "+r"(r2), [r3] "+r"(r3), [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2) - : "cc", "memory", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); - } - //! 
deal with remain wout - if (w_loop & 1) { - ptr_out0[0] += - r0[0] * w_tmp[0] + r0[1] * w_tmp[1] + r0[2] * w_tmp[2] + - r1[0] * w_tmp[4] + r1[1] * w_tmp[5] + r1[2] * w_tmp[6] + - r2[0] * w_tmp[8] + r2[1] * w_tmp[9] + r2[2] * w_tmp[10]; - - ptr_out0[1] += - r0[2] * w_tmp[0] + r0[3] * w_tmp[1] + r0[4] * w_tmp[2] + - r1[2] * w_tmp[4] + r1[3] * w_tmp[5] + r1[4] * w_tmp[6] + - r2[2] * w_tmp[8] + r2[3] * w_tmp[9] + r2[4] * w_tmp[10]; - - ptr_out0[2] += - r0[4] * w_tmp[0] + r0[5] * w_tmp[1] + r0[6] * w_tmp[2] + - r1[4] * w_tmp[4] + r1[5] * w_tmp[5] + r1[6] * w_tmp[6] + - r2[4] * w_tmp[8] + r2[5] * w_tmp[9] + r2[6] * w_tmp[10]; - - ptr_out0[3] += - r0[6] * w_tmp[0] + r0[7] * w_tmp[1] + r0[8] * w_tmp[2] + - r1[6] * w_tmp[4] + r1[7] * w_tmp[5] + r1[8] * w_tmp[6] + - r2[6] * w_tmp[8] + r2[7] * w_tmp[9] + r2[8] * w_tmp[10]; - - ptr_out1[0] += - r2[0] * w_tmp[0] + r2[1] * w_tmp[1] + r2[2] * w_tmp[2] + - r3[0] * w_tmp[4] + r3[1] * w_tmp[5] + r3[2] * w_tmp[6] + - r4[0] * w_tmp[8] + r4[1] * w_tmp[9] + r4[2] * w_tmp[10]; - - ptr_out1[1] += - r2[2] * w_tmp[0] + r2[3] * w_tmp[1] + r2[4] * w_tmp[2] + - r3[2] * w_tmp[4] + r3[3] * w_tmp[5] + r3[4] * w_tmp[6] + - r4[2] * w_tmp[8] + r4[3] * w_tmp[9] + r4[4] * w_tmp[10]; - - ptr_out1[2] += - r2[4] * w_tmp[0] + r2[5] * w_tmp[1] + r2[6] * w_tmp[2] + - r3[4] * w_tmp[4] + r3[5] * w_tmp[5] + r3[6] * w_tmp[6] + - r4[4] * w_tmp[8] + r4[5] * w_tmp[9] + r4[6] * w_tmp[10]; - - ptr_out1[3] += - r2[6] * w_tmp[0] + r2[7] * w_tmp[1] + r2[8] * w_tmp[2] + - r3[6] * w_tmp[4] + r3[7] * w_tmp[5] + r3[8] * w_tmp[6] + - r4[6] * w_tmp[8] + r4[7] * w_tmp[9] + r4[8] * w_tmp[10]; - } - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - slidingwindow_writeout_c1_fp32( - pre_out, dout_batch, c + 
c_round_down, c + c_round_down + 1, h, - h + h_kernel, 0, wout_round, chout, hout, wout, relu, ptr_write); - } - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/slidingwindow_conv3x3.h b/mobile/src/operators/math/slidingwindow_conv3x3.h deleted file mode 100644 index 8bdd682cdb..0000000000 --- a/mobile/src/operators/math/slidingwindow_conv3x3.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { -template -void SlidingwindowConv3x3s1(const framework::Tensor *input, - const framework::Tensor *filter, - const std::vector &paddings, - framework::Tensor *output); - -template -void SlidingwindowConv3x3s2(const framework::Tensor *input, - const framework::Tensor *filter, - const std::vector &paddings, - framework::Tensor *output); - -template -void SlidingwindowConv3x3s1Faster(const framework::Tensor *input, - framework::Tensor *filter, - const std::vector &paddings, - framework::Tensor *output, const float *bias, - bool is_bias, bool is_relu); - -template -void SlidingwindowConv3x3s2Faster(const framework::Tensor *input, - framework::Tensor *filter, - const std::vector &paddings, - framework::Tensor *output, const float *bias, - bool is_bias, bool is_relu); -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/slidingwindow_utils.cpp b/mobile/src/operators/math/slidingwindow_utils.cpp deleted file mode 100644 index cd20612482..0000000000 --- a/mobile/src/operators/math/slidingwindow_utils.cpp +++ /dev/null @@ -1,365 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/math/slidingwindow_utils.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -void slidingwindow_fill_bias(float* dout, const float* bias, int ch_num, - int ch_size) { - for (int j = 0; j < ch_num; j++) { - float32x4_t vb = vdupq_n_f32(bias[j]); - int i = 0; - for (; i < ch_size - 3; i += 4) { - vst1q_f32(dout + i, vb); - } - for (; i < ch_size; i++) { - dout[i] = bias[j]; - } - dout += ch_size; - } -} - -/* write result in outputs - * input din: [n, c, h, w], output dout: [n, c, h, w] - */ -void slidingwindow_writeout_c1_fp32(const float* din, float* dout, int cs, - int ce, int hs, int he, int ws, int we, - int channel, int height, int width, - bool flag_relu, float* trash_ptr) { - if (cs > channel) { - return; - } - - const int c1 = 1; - const int w4 = 4; - - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - - const float* ptr_din = din; - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int w_round = we - ws; - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - const float* din_hei_ptr = ptr_din + i * w_round * c1; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop */ - "fmax v1.4s, v0.4s, v20.4s \n" /* relu */ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - "str q1, [%[doutc0r0]], #16 \n" /* store c0r0 */ - "bne 1b \n" /* jump to main loop */ - : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v20"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! 
@ load data, c0r0, c1r0, " - "c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - - "vmax.f32 q1, q0, q15 @ relu\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - - "vst1.32 {d2-d3}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "1: \n" /* main loop */ - "str q0, [%[doutc0r0]], #16 \n" /* store c2r0 */ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "bne 1b \n" /* jump to main loop */ - - : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, c0r1, " - "c0r2, c0r3\n" - "1: @ main loop\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! 
@ load data \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c1 + c1 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = std::max(din_hei_ptr[0], 0.f); - din_hei_ptr++; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = *(din_hei_ptr++); - } - } - } - } -} - -/* write result in outputs - * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] - */ -void slidingwindow_writeout_c4_fp32(const float* din, float* dout, int cs, - int ce, int hs, int he, int ws, int we, - int channel, int height, int width, - bool flag_relu, float* trash_ptr) { - const int c4 = 4; - const int w4 = 4; - const int w_round = we - ws; - const int ch_n = ce - cs; - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - float* doutc2r0 = doutc1r0 + size_c_out; - float* doutc3r0 = doutc2r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - const float* din_hei_ptr = ptr_din + i * w_round * ch_n; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop */ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1 */ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1 */ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3 */ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10 */ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10 */ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11 */ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11 */ - "fmax v16.4s, v16.4s, v20.4s \n" /* relu */ - "fmax v17.4s, v17.4s, v20.4s \n" /* relu */ - "fmax v18.4s, v18.4s, v20.4s \n" /* relu */ - "fmax v19.4s, v19.4s, v20.4s \n" /* relu */ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0 */ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0 */ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0 */ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0 */ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - "bne 1b \n" /* jump to main loop */ - - : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), 
[doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @ load data \n" - "vmov.u32 q15, #0 @ dump zero \n" - "1: @ main loop \n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop */ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1 */ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1 */ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3 */ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10 */ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10 */ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11 */ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11 */ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0 */ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0 */ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0 */ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0 */ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - "bne 1b \n" /* jump to main loop */ - - : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", - "v18", "v19"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @ load data \n" - "1: @ main loop \n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! 
@ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c4 + c4 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = std::max(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = std::max(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = std::max(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = std::max(din_hei_ptr[3], 0.f); - din_hei_ptr += w4; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - din_hei_ptr += w4; - } - } - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/slidingwindow_utils.h b/mobile/src/operators/math/slidingwindow_utils.h deleted file mode 100644 index 6db22bcf5f..0000000000 --- a/mobile/src/operators/math/slidingwindow_utils.h +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "framework/tensor.h" - -#if __ARM_NEON -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -/* preprocessing weights - * input weights: [chout, chin/ group, kh, kw] --> outputs weights: [chout / n, - * chin/ group, kh, kw, n] - */ -template -void slidingwindow_transform_weight(const framework::Tensor& weight, - framework::Tensor* output) { - int chout = weight.dims()[0]; - int chin = weight.dims()[1]; - int kernel_size = weight.dims()[2] * weight.dims()[3]; - const int n = 4; - int cround = (chout + n - 1) / n * n; - const dtype* din = weight.data(); - dtype* dout = output->mutable_data({cround, chin, 3, 3}); - int c_loop = chout / n; - int chout_round = (chout + n - 1) / n; - int win_stride = chin * kernel_size; - int wout_stride = n * win_stride; - int co = 0; - for (; co < c_loop; ++co) { - dtype* dout_c = dout + co * wout_stride; - const dtype* din_array[n]; - din_array[0] = din + co * wout_stride; - for (int i = 1; i < n; i++) { - din_array[i] = din_array[i - 1] + win_stride; - } - for (int ci = 0; ci < chin; ++ci) { - for (int k = 0; k < kernel_size; ++k) { - for (int i = 0; i < n; i++) { - *(dout_c++) = *(din_array[i]++); - } - } - } - } - // pad final chout - if (chout_round > c_loop) { - dtype* dout_c = dout + c_loop * wout_stride; - const dtype* din_array[n]; - din_array[0] = din + c_loop * wout_stride; - for (int i = 1; i < n; i++) { - din_array[i] = din_array[i - 1] + win_stride; - } - // deal remain - int cremain = chout_round * n - chout; - for (int i = 1; i <= cremain; i++) { - 
din_array[n - i] = din_array[0]; - } - for (int ci = 0; ci < chin; ++ci) { - for (int k = 0; k < kernel_size; ++k) { - for (int i = 0; i < n; i++) { - *(dout_c++) = *(din_array[i]++); - } - } - } - } -} - -/* preprocessing inputs - * input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws] - * n = he - hs - */ -template -void slidingwindow_prepack_input(const dtype* din, dtype* dout, int cs, int ce, - int hs, int he, int ws, int we, int channel, - int width, int height, dtype* zero_ptr) { - int n = he - hs; - int w0 = ws < 0 ? 0 : ws; - int w1 = we > width ? width : we; - - int size_w = we - ws; - int size_wc_len = size_w * channel; - int size_c = width * height; - - int valid_w = w1 - w0; - size_t valid_w_byte = valid_w * sizeof(dtype); - - dtype* out_array[n]; - out_array[0] = dout; - for (int i = 1; i < n; i++) { - out_array[i] = out_array[i - 1] + size_wc_len; - } - - for (int c = 0; c < channel; ++c) { - int j = 0; - // valid height - for (int i = hs; i < he; i++) { - // get address - const dtype* in_array; - if (i < 0 || i >= height) { - in_array = zero_ptr; - } else { - in_array = din + i * width; - } - - for (int w = ws; w < w0; ++w) { - *(out_array[j]++) = 0.f; - } - memcpy(out_array[j], in_array, valid_w_byte); - out_array[j] += valid_w; - for (int w = w1; w < we; ++w) { - *(out_array[j]++) = 0.f; - } - j++; - } - din += size_c; - } -} - -inline void slidingwindow_fill_bias(float* dout, const float* bias, int size) { - float32x4_t vb = vld1q_f32(bias); - int cnt = size / 4; - for (int i = 0; i < cnt; ++i) { - vst1q_f32(dout, vb); - dout += 4; - } -} - -void slidingwindow_fill_bias(float* dout, const float* bias, int ch_num, - int ch_size); - -void slidingwindow_writeout_c1_fp32(const float* din, float* dout, int cs, - int ce, int hs, int he, int ws, int we, - int channel, int height, int width, - bool flag_relu, float* trash_ptr); - -void slidingwindow_writeout_c4_fp32(const float* din, float* dout, int cs, - int ce, int hs, int he, 
int ws, int we, - int channel, int height, int width, - bool flag_relu, float* trash_ptr); -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/softmax.cpp b/mobile/src/operators/math/softmax.cpp deleted file mode 100644 index e066b0cccd..0000000000 --- a/mobile/src/operators/math/softmax.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef SOFTMAX_OP - -#include "operators/math/softmax.h" -#include -#include -#include -#include "common/types.h" -#include "operators/math/math.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#ifndef __aarch64__ -inline float32_t vmaxvq_f32(const float32x4_t &r) { - float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpmax_f32(v, v), 0); -} - -inline float32_t vaddvq_f32(const float32x4_t &r) { - float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpadd_f32(v, v), 0); -} -#endif // __aarch64__ -#endif // __ARM_NEON__ - -float find_max(const float *input, const int num_classes) { - int remain = num_classes; - float max = -std::numeric_limits::max(); -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - for (int i = 0; i < loop; ++i, input += 8) { - 
float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - __max = vmaxq_f32(x0, __max); - __max = vmaxq_f32(x1, __max); - } - max = vmaxvq_f32(__max); -#endif - for (int i = 0; i < remain; ++i) { - max = std::max(max, input[i]); - } - return max; -} - -void SoftmaxBasic(const float *input, int num_classes, float *y) { - float *output = y; - // find max - float max = find_max(input, num_classes); - - // exp(x - max) and sum(exp(x - max)) - int remain = num_classes; - float sum = 0.f; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - float32x4_t __sum = vdupq_n_f32(0.f); - for (int i = 0; i < loop; ++i, input += 8, output += 8) { - float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - x0 = vsubq_f32(x0, __max); - x1 = vsubq_f32(x1, __max); - x0 = exp_ps(x0); - x1 = exp_ps(x1); - __sum = vaddq_f32(x0, __sum); - __sum = vaddq_f32(x1, __sum); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } - sum += vaddvq_f32(__sum); -#endif // __ARM_NEON__ - for (int i = 0; i < remain; ++i) { - float out = expf(input[i] - max); - sum += out; - output[i] = out; - } - - // exp(x - max) / sum - float inv_sum = 1.f / sum; - output = y; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - float32x4_t __inv_sum = vdupq_n_f32(inv_sum); - for (int i = 0; i < loop; ++i, output += 8) { - float32x4_t x0 = vld1q_f32(output); - float32x4_t x1 = vld1q_f32(output + 4); - x0 = vmulq_f32(x0, __inv_sum); - x1 = vmulq_f32(x1, __inv_sum); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } -#endif - for (int i = 0; i < remain; ++i) { - output[i] *= inv_sum; - } -} - -template <> -void SoftmaxFuntor::operator()(const framework::Tensor *X, - framework::Tensor *Y) { - const framework::DDim &dims = X->dims(); - int batch_size = dims[0]; - int num_classes = dims[dims.size() - 1]; - int channels = X->numel() / batch_size / num_classes; - const float *x 
= X->data(); - float *y = Y->mutable_data(); - - #pragma omp parallel for collapse(2) - for (int batch = 0; batch < X->dims()[0]; ++batch) { - for (int channel = 0; channel < channels; ++channel) { - size_t offset = (batch * channels + channel) * num_classes; - const float *input = x + offset; - float *output = y + offset; - SoftmaxBasic(input, num_classes, output); - } - } -} - -template <> -void SequenceSoftmaxFuntor::operator()( - const framework::LoDTensor *X, framework::LoDTensor *Y) { - const float *x = X->data(); - const auto &lod = X->lod().back(); - float *y = Y->mutable_data(); - - #pragma omp parallel for - for (int batch = 0; batch < lod.size() - 1; ++batch) { - int num_classes = lod[batch + 1] - lod[batch]; - size_t offset = lod[batch]; - const float *input = x + offset; - float *output = y + offset; - SoftmaxBasic(input, num_classes, output); - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // SOFTMAX_OP diff --git a/mobile/src/operators/math/softmax.h b/mobile/src/operators/math/softmax.h deleted file mode 100644 index dff25b9d02..0000000000 --- a/mobile/src/operators/math/softmax.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined(SOFTMAX_OP) || defined(SEQUENCE_SOFTMAX_OP) - -#pragma once - -#include "framework/lod_tensor.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -class SoftmaxFuntor { - public: - void operator()(const framework::Tensor *X, framework::Tensor *Y); -}; - -template -class SequenceSoftmaxFuntor { - public: - void operator()(const framework::LoDTensor *X, framework::LoDTensor *Y); -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/transform.h b/mobile/src/operators/math/transform.h deleted file mode 100644 index 7a31e12ef2..0000000000 --- a/mobile/src/operators/math/transform.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -namespace paddle_mobile { -namespace operators { -namespace math { - -// Transform applys a unary or a binary functor on each element in a -// range defined by a pair of iterators. -// -// - The specialization for CPU calls std::transform. -// - The specialization for CUDA calls thrust::tranform. -// -// NOTE: We need to define InputIter and OutputIter defined as -// different types, because the InputIter points op's inputs -// and -// OutputIter pints to op's outputs. 
-// -// NOTE: We don't assume that InputIter to be const InputType* and -// OutputIter to be OutputType*, because we might use a -// iterator -// class, paddle::fluid::operators::RowwiseTRansformIterator. - -struct Transform { - template - void operator()(InputIter first, InputIter last, OutputIter result, - UnaryOperation op) { - std::transform(first, last, result, op); - } - - template - void operator()(InputIter1 first1, InputIter1 last1, InputIter2 first2, - OutputIter result, BinaryOperation op) { - std::transform(first1, last1, first2, result, op); - } -}; -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/vol2col.cpp b/mobile/src/operators/math/vol2col.cpp deleted file mode 100644 index 9311e9e229..0000000000 --- a/mobile/src/operators/math/vol2col.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/math/vol2col.h" -#include - -namespace paddle_mobile { -namespace operators { -namespace math { - -using Tensor = paddle_mobile::framework::Tensor; -/* - * vol = [input_channels, input_depth, input_height, input_width] - * col = - * [input_channels, filter_depth, filter_height, filter_width, - * output_depth, output_height, output_width] - */ -template -class Vol2ColFunctor { - public: - void operator()(const Tensor &vol, const std::vector &dilations, - const std::vector &strides, - const std::vector &paddings, Tensor *col) const { - int input_channels = vol.dims()[0]; - int input_depth = vol.dims()[1]; - int input_height = vol.dims()[2]; - int input_width = vol.dims()[3]; - int filter_depth = col->dims()[1]; - int filter_height = col->dims()[2]; - int filter_width = col->dims()[3]; - int output_depth = col->dims()[4]; - int output_height = col->dims()[5]; - int output_width = col->dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - const T *vol_data = vol.data(); - T *col_data = col->data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int c_in = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; - - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + w; - int vol_idx = - ((c_in * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - col_data[col_idx] = - (h_pad < 0 || h_pad >= input_height || w_pad < 0 || - w_pad >= input_width || d_pad < 0 || d_pad >= 
input_depth) - ? static_cast(0) - : vol_data[vol_idx]; - } - } - } - } - } -}; - -/* - * vol = [input_channels,input_depth, input_height, input_width] - * col = - * [input_channels, filter_depth, filter_height, filter_width, - * output_depth, output_height, output_width] - */ -template -class Col2VolFunctor { - public: - void operator()(const Tensor &col, const std::vector &dilations, - const std::vector &strides, - const std::vector &paddings, Tensor *vol) const { - int input_channels = vol->dims()[0]; - int input_depth = vol->dims()[1]; - int input_height = vol->dims()[2]; - int input_width = vol->dims()[3]; - int filter_depth = col.dims()[1]; - int filter_height = col.dims()[2]; - int filter_width = col.dims()[3]; - int output_depth = col.dims()[4]; - int output_height = col.dims()[5]; - int output_width = col.dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - T *vol_data = vol->data(); - const T *col_data = col.data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int cIm = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; - - if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && - w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { - int vol_idx = - ((cIm * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + - w; - vol_data[vol_idx] += col_data[col_idx]; - } - } - } - } - } - } -}; - -template class Vol2ColFunctor; 
-template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/vol2col.h b/mobile/src/operators/math/vol2col.h deleted file mode 100644 index 772bdf809a..0000000000 --- a/mobile/src/operators/math/vol2col.h +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "common/types.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { -/* - * \brief Converts the feature data of four dimensions(CDHW) into a - * colData of - * seven dimensions in the Vol2ColFunctor calculation, - * And in the Col2VolFunctor calculation, it is reversed. - * - * \param volData Vol data. - * \param volShape The shape of volData, - * [input_channels, input_depth, input_height, - * input_width]. - * \param colData Column data. - * \param colShape The shape of colData. - * - * \param dilations dilation data. - * \param 3-dimension [dilation_depth, dilation_height, - * dilation_width]. - * - * \param strides stride data. - * \param 3-dimension [stride_depth, stride_height, stride_width]. - * - * \param paddings padding data. - * \param 3-dimension [d_pad, h_pad, w_pad]. 
- * - * The shape of colData is: - * [input_channels, filter_depth, filter_height, filter_width, - * output_depth, - * output_height, output_width] - * So, it is easy to reshape into a convolution matrix for - * convolution - * calculation based on matrix multiplication. - * The shape of convolution matrix is [height, width], where the - * height is equal - * input_channels * filter_depth * filter_height * filter_width, and - * the width - * is equal output_depth * output_height * output_width. - * - * Reshape: - * shape of colData shape of convolution matrix - * [input_channels, - * filter_depth, - * filter_height, - * filter_width, ======> [height, width] - * output_depth, - * output_height, - * output_width] - * - * \note The caller needs to ensure that volShape.inputChannels is - * equal to - * colShape.inputChannels. - */ -using Tensor = paddle_mobile::framework::Tensor; - -template -class Vol2ColFunctor { - public: - void operator()(const Tensor &vol, const std::vector &dilations, - const std::vector &strides, - const std::vector &paddings, Tensor *col) const; -}; - -template -class Col2VolFunctor { - public: - void operator()(const Tensor &col, const std::vector &dilations, - const std::vector &strides, - const std::vector &paddings, Tensor *vol) const; -}; - -} // namespace math -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/math/winograd/winograd_transform.h b/mobile/src/operators/math/winograd/winograd_transform.h deleted file mode 100644 index 599a9b9233..0000000000 --- a/mobile/src/operators/math/winograd/winograd_transform.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#pragma once - -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template -void winograd_transform_weight(const framework::Tensor &weight, - framework::Tensor *output); - -template -void winograd_transform_input(const framework::Tensor &input, - framework::Tensor *output); - -template -void winograd_transform_output(const framework::Tensor &input, - const framework::Tensor &weight, - framework::Tensor *output); - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/math/winograd/winograd_transform_f6k3.cpp b/mobile/src/operators/math/winograd/winograd_transform_f6k3.cpp deleted file mode 100644 index 4ba0ee4cb6..0000000000 --- a/mobile/src/operators/math/winograd/winograd_transform_f6k3.cpp +++ /dev/null @@ -1,1681 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// Inspired by https://arxiv.org/abs/1509.09308 and refered from nnpack and ncnn -// project. 
- -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#ifdef CONV_OP - -#include -#include "operators/math/pad.h" -#include "operators/math/winograd/winograd_transform.h" - -namespace paddle_mobile { -namespace operators { -namespace math { - -template <> -void winograd_transform_weight<8, 3>(const framework::Tensor &weight, - framework::Tensor *output) { - /* - * w0 = g0 - * w1 = ((g0 + g2) + g1) * (-2.0 / 9) - * w2 = ((g0 + g2) - g1) * (-2.0 / 9) - * w3 = ((g0 + 4 * g2) + 2 * g1) * (1.0 / 90) - * w4 = ((g0 + 4 * g2) - 2 * g1) * (1.0 / 90) - * w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180) - * w6 = ((g2 + 4 * g0) - 2 * g1) * (1.0 / 180) - * w7 = g2 - */ - // weight shape is [out_channel, in_channel, kernel_h, kernel_w] - // package weight into [roundup(out_channel/4), 64, in_channel, 4] tiles - int out_channel = weight.dims()[0]; - int in_channel = weight.dims()[1]; - // reshape and alloc transformed weight - framework::DDim transformed_shape = framework::make_ddim( - std::vector{(out_channel + 3) / 4, 64, in_channel, 4}); - float *trans_outptr = output->mutable_data(transformed_shape); - memset(trans_outptr, 0, output->numel() * sizeof(float)); - - const float transform_matrix[8] = {2.f, -2.f / 9, 1.f / 90, 1.f / 180}; - const float *inptr = weight.data(); - -#if __aarch64__ - int remain_start = 0; -#else - int remain_start = out_channel & 0xFFFFFFFC; - - #pragma omp parallel for - for (int oc = 0; oc < out_channel - 3; oc += 4) { - float gw[96]; // gw[3][8][4] - const float *inptr0 = inptr + oc * in_channel * 9; - const float *inptr1 = inptr + (oc + 1) * in_channel * 9; - const float *inptr2 = inptr + (oc + 2) * in_channel * 9; - const float *inptr3 = inptr + (oc + 3) * in_channel * 9; - // oc * 64 * in_channel - float *outptr = trans_outptr + ((oc * in_channel) << 6); - for (int ic = 0; ic < in_channel; ++ic) { - float *gw_ptr = gw; - asm volatile( - "vld1.32 {d0-d1}, [%[tm_ptr]] \n" - - "mov r0, #24 \n" - "vld1.32 {d2-d5}, [%[inptr0]], r0 \n" - "vld1.32 {d6-d9}, 
[%[inptr1]], r0 \n" - "vld1.32 {d10-d13}, [%[inptr2]], r0 \n" - "vld1.32 {d14-d17}, [%[inptr3]], r0 \n" - "vtrn.32 q1, q3 \n" - "vtrn.32 q2, q4 \n" - "vtrn.32 q5, q7 \n" - "vtrn.32 q6, q8 \n" - "vswp.32 d3, d10 \n" - "vswp.32 d7, d14 \n" - "vswp.32 d5, d12 \n" - "vswp.32 d9, d16 \n" - - // q1: g0, q3: g1, q5: g2 - "vst1.32 {d2-d3}, [%[gw_ptr]]! \n" - "vadd.f32 q9, q1, q5 \n" - "vadd.f32 q10, q9, q3 \n" - "vsub.f32 q11, q9, q3 \n" - "vmul.f32 q10, q10, d0[1] \n" - "vst1.32 {d20-d21}, [%[gw_ptr]]! \n" - "vmul.f32 q11, q11, d0[1] \n" - "vst1.32 {d22-d23}, [%[gw_ptr]]! \n" - - "vmul.f32 q9, q1, d0[0] \n" - "vmul.f32 q9, q9, d0[0] \n" // 4 * g0 - "vmul.f32 q10, q3, d0[0] \n" // 2 * g1 - "vmul.f32 q11, q5, d0[0] \n" - "vmul.f32 q11, q11, d0[0] \n" // 4 * g2 - - "vadd.f32 q12, q1, q11 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - - "vadd.f32 q12, q5, q9 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - - "vst1.32 {d10-d11}, [%[gw_ptr]]! \n" - - // q7: g0, q2: g1, q4: g2 - "vst1.32 {d14-d15}, [%[gw_ptr]]! \n" - "vadd.f32 q9, q7, q4 \n" - "vadd.f32 q10, q9, q2 \n" - "vsub.f32 q11, q9, q2 \n" - "vmul.f32 q10, q10, d0[1] \n" - "vst1.32 {d20-d21}, [%[gw_ptr]]! \n" - "vmul.f32 q11, q11, d0[1] \n" - "vst1.32 {d22-d23}, [%[gw_ptr]]! \n" - - "vmul.f32 q9, q7, d0[0] \n" - "vmul.f32 q9, q9, d0[0] \n" // 4 * g0 - "vmul.f32 q10, q2, d0[0] \n" // 2 * g1 - "vmul.f32 q11, q4, d0[0] \n" - "vmul.f32 q11, q11, d0[0] \n" // 4 * g2 - - "vadd.f32 q12, q7, q11 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! 
\n" - - "vadd.f32 q12, q4, q9 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - - "vst1.32 {d8-d9}, [%[gw_ptr]]! \n" - - "mov r0, #12 \n" - "vld1.32 {d2-d3}, [%[inptr0]], r0 \n" - "vld1.32 {d6-d7}, [%[inptr1]], r0 \n" - "vld1.32 {d10-d11}, [%[inptr2]], r0 \n" - "vld1.32 {d14-d15}, [%[inptr3]], r0 \n" - "vtrn.32 q1, q3 \n" - "vtrn.32 q5, q7 \n" - "vswp.32 d3, d10 \n" - "vswp.32 d7, d14 \n" - - // q1: g0, q3: g1, q5: g2 - "vst1.32 {d2-d3}, [%[gw_ptr]]! \n" - "vadd.f32 q9, q1, q5 \n" - "vadd.f32 q10, q9, q3 \n" - "vsub.f32 q11, q9, q3 \n" - "vmul.f32 q10, q10, d0[1] \n" - "vst1.32 {d20-d21}, [%[gw_ptr]]! \n" - "vmul.f32 q11, q11, d0[1] \n" - "vst1.32 {d22-d23}, [%[gw_ptr]]! \n" - - "vmul.f32 q9, q1, d0[0] \n" - "vmul.f32 q9, q9, d0[0] \n" // 4 * g0 - "vmul.f32 q10, q3, d0[0] \n" // 2 * g1 - "vmul.f32 q11, q5, d0[0] \n" - "vmul.f32 q11, q11, d0[0] \n" // 4 * g2 - - "vadd.f32 q12, q1, q11 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - - "vadd.f32 q12, q5, q9 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[gw_ptr]]! \n" - - "vst1.32 {d10-d11}, [%[gw_ptr]]! 
\n" - : [gw_ptr] "+r"(gw_ptr), [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3) - : [tm_ptr] "r"((float *)transform_matrix) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "r0"); - - float *gw_ptr0 = gw; - float *gw_ptr1 = gw + 32; - float *gw_ptr2 = gw + 64; - float *outptr0 = outptr + (ic << 2); // ic * 4 - int steps = (in_channel << 2) * sizeof(float); // in_channel * 4 - asm volatile( - "vld1.32 {d0-d1}, [%[tm_ptr]] \n" - "mov r0, #8 \n" - - "loop_8_%=: \n" - "vld1.32 {d2-d3}, [%[gw_ptr0]]! \n" - "vld1.32 {d4-d5}, [%[gw_ptr1]]! \n" - "vld1.32 {d6-d7}, [%[gw_ptr2]]! \n" - - // q1: g0, q2: g1, q3: g2 - "vst1.32 {d2-d3}, [%[outptr0]], %[steps] \n" - "vadd.f32 q9, q1, q3 \n" - "vadd.f32 q10, q9, q2 \n" - "vsub.f32 q11, q9, q2 \n" - "vmul.f32 q10, q10, d0[1] \n" - "vst1.32 {d20-d21}, [%[outptr0]], %[steps] \n" - "vmul.f32 q11, q11, d0[1] \n" - "vst1.32 {d22-d23}, [%[outptr0]], %[steps] \n" - - "vmul.f32 q9, q1, d0[0] \n" - "vmul.f32 q9, q9, d0[0] \n" // 4 * g0 - "vmul.f32 q10, q2, d0[0] \n" // 2 * g1 - "vmul.f32 q11, q3, d0[0] \n" - "vmul.f32 q11, q11, d0[0] \n" // 4 * g2 - - "vadd.f32 q12, q1, q11 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[outptr0]], %[steps] \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[0] \n" - "vst1.32 {d26-d27}, [%[outptr0]], %[steps] \n" - - // w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180) - "vadd.f32 q12, q3, q9 \n" - "vadd.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[outptr0]], %[steps] \n" - "vsub.f32 q13, q12, q10 \n" - "vmul.f32 q13, q13, d1[1] \n" - "vst1.32 {d26-d27}, [%[outptr0]], %[steps] \n" - - "vst1.32 {d6-d7}, [%[outptr0]], %[steps] \n" - - "subs r0, #1 \n" - "bne loop_8_%= \n" - : [outptr0] "+r"(outptr0), [gw_ptr0] "+r"(gw_ptr0), - [gw_ptr1] "+r"(gw_ptr1), [gw_ptr2] "+r"(gw_ptr2) - : [tm_ptr] "r"((float *)transform_matrix), [steps] 
"r"(steps) - : "cc", "memory", "q0", "q1", "q2", "q3", "q9", "q10", "q11", "q12", - "q13", "r0"); - } - } -#endif // __aarch64__ - - // remain output channel - #pragma omp parallel for - for (int oc = remain_start; oc < out_channel; ++oc) { - float gw[3][8]; // gw[3][8] - const float *inptr0 = inptr + oc * in_channel * 9; // - // (oc / 4) * 64 * in_channel * 4 + oc % 4 - int offset = ((oc & 0xFFFFFFFC) << 6) * in_channel + (oc & 0x3); - int steps = (in_channel << 2); // in_channel * 4 - float *outptr = trans_outptr + offset; - for (int ic = 0; ic < in_channel; ++ic) { - for (int i = 0; i < 3; ++i, inptr0 += 3) { - float g0 = inptr0[0]; - float g1 = inptr0[1]; - float g2 = inptr0[2]; - float d0 = g0 + g2; - float d1 = g0 + 4 * g2; - float d2 = g2 + 4 * g0; - float d3 = 2 * g1; - gw[i][0] = g0; - gw[i][1] = -2.f / 9 * (d0 + g1); // -2.f/9 * (g0 + g1 + g2) - gw[i][2] = -2.f / 9 * (d0 - g1); // -2.f/9 * (g0 - g1 + g2) - gw[i][3] = 1.f / 90 * (d1 + d3); // 1.f/90 * (g0 + 2 * g1 + 4 * g2) - gw[i][4] = 1.f / 90 * (d1 - d3); // 1.f/90 * (g0 - 2 * g1 + 4 * g2) - gw[i][5] = 1.f / 180 * (d2 + d3); // 1.f/180 * (4 * g0 + 2 * g1 + g2) - gw[i][6] = 1.f / 180 * (d2 - d3); // 1.f/180 * (4 * g0 - 2 * g1 + g2) - gw[i][7] = g2; - } - for (int i = 0; i < 8; ++i) { - float g0 = gw[0][i]; - float g1 = gw[1][i]; - float g2 = gw[2][i]; - float d0 = g0 + g2; - float d1 = g0 + 4 * g2; - float d2 = g2 + 4 * g0; - float d3 = 2 * g1; - int offset = i * 8 * steps; - outptr[offset] = g0; - outptr[offset + 1 * steps] = -2.f / 9 * (d0 + g1); - outptr[offset + 2 * steps] = -2.f / 9 * (d0 - g1); - outptr[offset + 3 * steps] = 1.f / 90 * (d1 + d3); - outptr[offset + 4 * steps] = 1.f / 90 * (d1 - d3); - outptr[offset + 5 * steps] = 1.f / 180 * (d2 + d3); - outptr[offset + 6 * steps] = 1.f / 180 * (d2 - d3); - outptr[offset + 7 * steps] = g2; - } - outptr += 4; - } - } -} - -template <> -void winograd_transform_input<8, 3>(const framework::Tensor &input, - framework::Tensor *output) { - /* - * x0 = (d0 
- d6) + (d4 - d2) * 5.25 - * x1 = (d2 + d6) - 4.25 * (d4 + d3) + (d1 + d5) - * x2 = (d2 + d6) - 4.25 * (d4 - d3) - (d1 + d5) - * x3 = (0.25 * d2 - 1.25 * d4 + d6) + (0.5 * d1 - 2.5 * d3 + 2 * d5) - * x4 = (0.25 * d2 - 1.25 * d4 + d6) - (0.5 * d1 - 2.5 * d3 + 2 * d5) - * x5 = (4 * d2 - 5 * d4 + d6) + (2 * d1 - 2.5 * d3 + 0.5 * d5) - * x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5) - * x7 = (d7 - d1) + (d3 - d5) * 5.25 - */ - // package input into [roundup(tiles/8), 64, channel, 8] tiles - int channel = input.dims()[1]; - int height = input.dims()[2]; - int width = input.dims()[3]; - int h_tiles = (height + 3) / 6; // (height - 2 + 5) / 6 - int w_tiles = (width + 3) / 6; // (width - 2 + 5) / 6 - int tiles = (h_tiles * w_tiles + 7) / 8; - framework::DDim transformed_shape = - framework::make_ddim(std::vector{tiles, 64, channel, 8}); - float *outptr = output->mutable_data(transformed_shape); - memset(outptr, 0, output->numel() * sizeof(float)); - - const float *inptr = input.data(); - height = h_tiles * 6 + 2; - width = w_tiles * 6 + 2; - framework::Tensor input_pad; - if (height > input.dims()[2] || width > input.dims()[3]) { - framework::DDim input_shape = - framework::make_ddim(std::vector{1, channel, height, width}); - PadFunctor pad; - inptr = input_pad.mutable_data(input_shape); - pad(input, 0, height - input.dims()[2], 0, width - input.dims()[3], - &input_pad); - } - size_t image_size = height * width; - const float transform_matrix[8] = {5.25f, -5.f, -4.25f, -2.5f, - 2.f, -1.25f, 0.5f, 0.25f}; - #pragma omp parallel for - for (int c = 0; c < channel; ++c) { - const float *in = inptr + c * image_size; - float d_bt[64]; // d * B_t - for (int h = 0; h < h_tiles; ++h) { - for (int w = 0; w < w_tiles; ++w) { - const float *in0 = in + (h * width + w) * 6; - const float *in1 = in0 + width; - const float *in2 = in1 + width; - const float *in3 = in2 + width; - float *d_bt_ptr = d_bt; -#if __aarch64__ - int steps = 4 * width; - float32x4_t _q0 = 
vld1q_f32(transform_matrix); - float32x4_t _q1 = vld1q_f32(transform_matrix + 4); - for (int l = 0; l < 2; ++l) { - float32x4x2_t _q23, _q45, _q67, _q89; - _q23.val[0] = vld1q_f32(in0); - _q45.val[0] = vld1q_f32(in0 + 4); - _q23.val[1] = vld1q_f32(in1); - _q45.val[1] = vld1q_f32(in1 + 4); - _q67.val[0] = vld1q_f32(in2); - _q89.val[0] = vld1q_f32(in2 + 4); - _q67.val[1] = vld1q_f32(in3); - _q89.val[1] = vld1q_f32(in3 + 4); - _q23 = vtrnq_f32(_q23.val[0], _q23.val[1]); - _q45 = vtrnq_f32(_q45.val[0], _q45.val[1]); - _q67 = vtrnq_f32(_q67.val[0], _q67.val[1]); - _q89 = vtrnq_f32(_q89.val[0], _q89.val[1]); - float32x4_t _q2 = vcombine_f32(vget_low_f32(_q23.val[0]), - vget_low_f32(_q67.val[0])); - float32x4_t _q4 = vcombine_f32(vget_low_f32(_q23.val[1]), - vget_low_f32(_q67.val[1])); - float32x4_t _q3 = vcombine_f32(vget_low_f32(_q45.val[0]), - vget_low_f32(_q89.val[0])); - float32x4_t _q5 = vcombine_f32(vget_low_f32(_q45.val[1]), - vget_low_f32(_q89.val[1])); - float32x4_t _q6 = vcombine_f32(vget_high_f32(_q23.val[0]), - vget_high_f32(_q67.val[0])); - float32x4_t _q8 = vcombine_f32(vget_high_f32(_q23.val[1]), - vget_high_f32(_q67.val[1])); - float32x4_t _q7 = vcombine_f32(vget_high_f32(_q45.val[0]), - vget_high_f32(_q89.val[0])); - float32x4_t _q9 = vcombine_f32(vget_high_f32(_q45.val[1]), - vget_high_f32(_q89.val[1])); - - float32x4_t _q10 = vsubq_f32(_q2, _q7); - float32x4_t _q11 = vsubq_f32(_q3, _q6); - _q10 = vmlaq_lane_f32(_q10, _q11, vget_low_f32(_q0), 0); - vst1q_f32(d_bt_ptr, _q10); - - _q10 = vaddq_f32(_q6, _q7); - _q11 = vaddq_f32(_q4, _q5); - _q10 = vmlaq_lane_f32(_q10, _q3, vget_high_f32(_q0), 0); - _q11 = vmlaq_lane_f32(_q11, _q8, vget_high_f32(_q0), 0); - float32x4_t _q12 = vaddq_f32(_q10, _q11); - float32x4_t _q13 = vsubq_f32(_q10, _q11); - vst1q_f32(d_bt_ptr + 4, _q12); - vst1q_f32(d_bt_ptr + 8, _q13); - - _q10 = vmulq_lane_f32(_q6, vget_high_f32(_q1), 1); - _q11 = vmulq_lane_f32(_q4, vget_high_f32(_q1), 0); - _q10 = vaddq_f32(_q10, _q7); - _q11 = 
vmlaq_lane_f32(_q11, _q5, vget_low_f32(_q1), 0); - _q10 = vmlaq_lane_f32(_q10, _q3, vget_low_f32(_q1), 1); - _q11 = vmlaq_lane_f32(_q11, _q8, vget_high_f32(_q0), 1); - _q12 = vaddq_f32(_q10, _q11); - _q13 = vsubq_f32(_q10, _q11); - vst1q_f32(d_bt_ptr + 12, _q12); - vst1q_f32(d_bt_ptr + 16, _q13); - - _q10 = vmulq_lane_f32(_q6, vget_low_f32(_q1), 0); - _q11 = vmulq_lane_f32(_q4, vget_low_f32(_q1), 0); - _q10 = vmlaq_lane_f32(_q10, _q3, vget_high_f32(_q0), 1); - _q11 = vmlaq_lane_f32(_q11, _q8, vget_high_f32(_q0), 1); - _q10 = vmlaq_lane_f32(_q10, _q7, vget_high_f32(_q1), 0); - _q11 = vmlaq_lane_f32(_q11, _q5, vget_high_f32(_q1), 0); - _q10 = vmulq_lane_f32(_q10, vget_low_f32(_q1), 0); - _q12 = vaddq_f32(_q10, _q11); - _q13 = vsubq_f32(_q10, _q11); - vst1q_f32(d_bt_ptr + 20, _q12); - vst1q_f32(d_bt_ptr + 24, _q13); - - _q10 = vsubq_f32(_q9, _q4); - _q11 = vsubq_f32(_q8, _q5); - _q10 = vmlaq_lane_f32(_q10, _q11, vget_low_f32(_q0), 0); - vst1q_f32(d_bt_ptr + 28, _q10); - - in0 += steps; - in1 += steps; - in2 += steps; - in3 += steps; - d_bt_ptr += 32; - } -#else - int steps = 4 * width * sizeof(float); - asm volatile( - "vld1.32 {d0-d3}, [%[tm_ptr]] \n" - "mov r0, #2 \n" - // row loop - "loop_r_%=: \n" - "vld1.32 {d4-d7}, [%[in0]], %[steps] \n" - "vld1.32 {d8-d11}, [%[in1]], %[steps] \n" - "vld1.32 {d12-d15}, [%[in2]], %[steps] \n" - "vld1.32 {d16-d19}, [%[in3]], %[steps] \n" - "vtrn.32 q2, q4 \n" // d0: q2 - "vtrn.32 q3, q5 \n" // d1: q4 - "vtrn.32 q6, q8 \n" // d2: q6 - "vtrn.32 q7, q9 \n" // d3: q8 - "vswp.32 d5, d12 \n" // d4: q3 - "vswp.32 d9, d16 \n" // d5: q5 - "vswp.32 d7, d14 \n" // d6: q7 - "vswp.32 d11, d18 \n" // d7: q9 - - "vsub.f32 q10, q2, q7 \n" - "vsub.f32 q11, q3, q6 \n" - "vmla.f32 q10, q11, d0[0] \n" // d0 - d6 + (d4 - - // d2) * 5.25" - "vst1.32 {d20-d21}, [%[d_bt]]! 
\n" - - "vadd.f32 q10, q6, q7 \n" - "vadd.f32 q11, q4, q5 \n" - "vmla.f32 q10, q3, d1[0] \n" // d2 - 4.25 * d4 + - // d6 - "vmla.f32 q11, q8, d1[0] \n" // d1 - 4.25 * d3 + - // d5 - "vadd.f32 q12, q10, q11 \n" - "vsub.f32 q13, q10, q11 \n" - "vst1.32 {d24-d27}, [%[d_bt]]! \n" - - "vmul.f32 q10, q6, d3[1] \n" // 0.25 * d2 - "vmul.f32 q11, q4, d3[0] \n" // 0.5 * d1 - "vadd.f32 q10, q10, q7 \n" // 0.25 * d2 + d6 - "vmla.f32 q11, q5, d2[0] \n" // 0.5 * d1 + 2 * - // d5 - "vmla.f32 q10, q3, d2[1] \n" // 0.25 * d2 + d6 - // - 1.25 * d4 - "vmla.f32 q11, q8, d1[1] \n" // 0.5 * d1 + 2 * - // d5 - 2.5 * d3 - "vadd.f32 q12, q10, q11 \n" - "vsub.f32 q13, q10, q11 \n" - "vst1.32 {d24-d27}, [%[d_bt]]! \n" - - "vmul.f32 q10, q6, d2[0] \n" // 2 * d2 - "vmul.f32 q11, q4, d2[0] \n" // 2 * d1 - "vmla.f32 q10, q3, d1[1] \n" // 2 * d2 - 2.5 * - // d4 - "vmla.f32 q11, q8, d1[1] \n" // 2 * d1 - 2.5 * - // d3 - "vmla.f32 q10, q7, d3[0] \n" // 2 * d1 - 2.5 * - // d3 + 0.5 * d6 - "vmla.f32 q11, q5, d3[0] \n" // 2 * d2 - 2.5 * - // d4 + 0.5 * d5 - "vmul.f32 q10, q10, d2[0] \n" // 4 * d1 - 5 * d3 - // + d6 - "vadd.f32 q12, q10, q11 \n" - "vsub.f32 q13, q10, q11 \n" - "vst1.32 {d24-d27}, [%[d_bt]]! \n" - - "vsub.f32 q10, q9, q4 \n" - "vsub.f32 q11, q8, q5 \n" - "vmla.f32 q10, q11, d0[0] \n" - "vst1.32 {d20-d21}, [%[d_bt]]! 
\n" - - "subs r0, #1 \n" - "bne loop_r_%= \n" - : [d_bt] "+r"(d_bt_ptr), [in0] "+r"(in0), [in1] "+r"(in1), - [in2] "+r"(in2), [in3] "+r"(in3) - : [tm_ptr] "r"((float *)transform_matrix), [steps] "r"(steps) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "r0"); -#endif // __aarch64__ - float *ptr0 = d_bt; - float *ptr1 = ptr0 + 32; - int tile_indics = h * w_tiles + w; - int tile_block = tile_indics >> 3; - int block_indics = tile_indics & 0x7; - // (tiles / 8, 64, channel, 8) - float *out0 = - outptr + (tile_block * 64 * channel + c) * 8 + block_indics; - float *out1 = out0 + channel * 8; - float *out2 = out1 + channel * 8; - float *out3 = out2 + channel * 8; - float *out4 = out3 + channel * 8; - float *out5 = out4 + channel * 8; - float *out6 = out5 + channel * 8; - float *out7 = out6 + channel * 8; -#if __aarch64__ - steps = 8 * channel * 8; - for (int l = 0; l < 2; ++l) { - float32x4x2_t _q23, _q45, _q67, _q89; - _q23.val[0] = vld1q_f32(ptr0); - _q23.val[1] = vld1q_f32(ptr0 + 4); - _q45.val[0] = vld1q_f32(ptr0 + 8); - _q45.val[1] = vld1q_f32(ptr0 + 12); - _q67.val[0] = vld1q_f32(ptr1); - _q67.val[1] = vld1q_f32(ptr1 + 4); - _q89.val[0] = vld1q_f32(ptr1 + 8); - _q89.val[1] = vld1q_f32(ptr1 + 12); - _q23 = vtrnq_f32(_q23.val[0], _q23.val[1]); - _q45 = vtrnq_f32(_q45.val[0], _q45.val[1]); - _q67 = vtrnq_f32(_q67.val[0], _q67.val[1]); - _q89 = vtrnq_f32(_q89.val[0], _q89.val[1]); - float32x4_t _q2 = vcombine_f32(vget_low_f32(_q23.val[0]), - vget_low_f32(_q45.val[0])); - float32x4_t _q4 = vcombine_f32(vget_high_f32(_q23.val[0]), - vget_high_f32(_q45.val[0])); - float32x4_t _q3 = vcombine_f32(vget_low_f32(_q23.val[1]), - vget_low_f32(_q45.val[1])); - float32x4_t _q5 = vcombine_f32(vget_high_f32(_q23.val[1]), - vget_high_f32(_q45.val[1])); - float32x4_t _q6 = vcombine_f32(vget_low_f32(_q67.val[0]), - vget_low_f32(_q89.val[0])); - float32x4_t _q8 = vcombine_f32(vget_high_f32(_q67.val[0]), - 
vget_high_f32(_q89.val[0])); - float32x4_t _q7 = vcombine_f32(vget_low_f32(_q67.val[1]), - vget_low_f32(_q89.val[1])); - float32x4_t _q9 = vcombine_f32(vget_high_f32(_q67.val[1]), - vget_high_f32(_q89.val[1])); - - float32x4_t _q10 = vsubq_f32(_q2, _q8); - float32x4_t _q11 = vsubq_f32(_q6, _q4); - _q10 = vmlaq_lane_f32(_q10, _q11, vget_low_f32(_q0), 0); - vst1q_lane_f32(out0, _q10, 0); - vst1q_lane_f32(out0 + steps, _q10, 1); - vst1q_lane_f32(out0 + 2 * steps, _q10, 2); - vst1q_lane_f32(out0 + 3 * steps, _q10, 3); - - _q10 = vaddq_f32(_q4, _q8); - _q11 = vaddq_f32(_q3, _q7); - _q10 = vmlaq_lane_f32(_q10, _q6, vget_high_f32(_q0), 0); - _q11 = vmlaq_lane_f32(_q11, _q5, vget_high_f32(_q0), 0); - float32x4_t _q12 = vaddq_f32(_q10, _q11); - vst1q_lane_f32(out1, _q12, 0); - vst1q_lane_f32(out1 + steps, _q12, 1); - vst1q_lane_f32(out1 + 2 * steps, _q12, 2); - vst1q_lane_f32(out1 + 3 * steps, _q12, 3); - - _q12 = vsubq_f32(_q10, _q11); - vst1q_lane_f32(out2, _q12, 0); - vst1q_lane_f32(out2 + steps, _q12, 1); - vst1q_lane_f32(out2 + 2 * steps, _q12, 2); - vst1q_lane_f32(out2 + 3 * steps, _q12, 3); - - _q10 = vmulq_lane_f32(_q4, vget_high_f32(_q1), 1); - _q11 = vmulq_lane_f32(_q3, vget_high_f32(_q1), 0); - _q10 = vaddq_f32(_q10, _q8); - _q11 = vmlaq_lane_f32(_q11, _q7, vget_low_f32(_q1), 0); - _q10 = vmlaq_lane_f32(_q10, _q6, vget_low_f32(_q1), 1); - _q11 = vmlaq_lane_f32(_q11, _q5, vget_high_f32(_q0), 1); - _q12 = vaddq_f32(_q10, _q11); - vst1q_lane_f32(out3, _q12, 0); - vst1q_lane_f32(out3 + steps, _q12, 1); - vst1q_lane_f32(out3 + 2 * steps, _q12, 2); - vst1q_lane_f32(out3 + 3 * steps, _q12, 3); - - _q12 = vsubq_f32(_q10, _q11); - vst1q_lane_f32(out4, _q12, 0); - vst1q_lane_f32(out4 + steps, _q12, 1); - vst1q_lane_f32(out4 + 2 * steps, _q12, 2); - vst1q_lane_f32(out4 + 3 * steps, _q12, 3); - - _q10 = vmulq_lane_f32(_q4, vget_low_f32(_q1), 0); - _q11 = vmulq_lane_f32(_q3, vget_low_f32(_q1), 0); - _q10 = vmlaq_lane_f32(_q10, _q6, vget_high_f32(_q0), 1); - _q11 = 
vmlaq_lane_f32(_q11, _q5, vget_high_f32(_q0), 1); - _q10 = vmlaq_lane_f32(_q10, _q8, vget_high_f32(_q1), 0); - _q11 = vmlaq_lane_f32(_q11, _q7, vget_high_f32(_q1), 0); - _q10 = vmulq_lane_f32(_q10, vget_low_f32(_q1), 0); - _q12 = vaddq_f32(_q10, _q11); - vst1q_lane_f32(out5, _q12, 0); - vst1q_lane_f32(out5 + steps, _q12, 1); - vst1q_lane_f32(out5 + 2 * steps, _q12, 2); - vst1q_lane_f32(out5 + 3 * steps, _q12, 3); - - _q12 = vsubq_f32(_q10, _q11); - vst1q_lane_f32(out6, _q12, 0); - vst1q_lane_f32(out6 + steps, _q12, 1); - vst1q_lane_f32(out6 + 2 * steps, _q12, 2); - vst1q_lane_f32(out6 + 3 * steps, _q12, 3); - - _q10 = vsubq_f32(_q9, _q3); - _q11 = vsubq_f32(_q5, _q7); - _q10 = vmlaq_lane_f32(_q10, _q11, vget_low_f32(_q0), 0); - vst1q_lane_f32(out7, _q10, 0); - vst1q_lane_f32(out7 + steps, _q10, 1); - vst1q_lane_f32(out7 + 2 * steps, _q10, 2); - vst1q_lane_f32(out7 + 3 * steps, _q10, 3); - - ptr0 += 16; - ptr1 += 16; - out0 += 4 * steps; - out1 += 4 * steps; - out2 += 4 * steps; - out3 += 4 * steps; - out4 += 4 * steps; - out5 += 4 * steps; - out6 += 4 * steps; - out7 += 4 * steps; - } -#else - steps = 8 * channel * 8 * sizeof(float); - asm volatile( - "mov r0, #2 \n" - "vld1.32 {d0-d3}, [%[tm_ptr]] \n" - // row loop - "loop_r_%=: \n" - "vld1.32 {d4-d7}, [%[ptr0]]! \n" // q2: d0, q3: d1 - "vld1.32 {d8-d11}, [%[ptr0]]! \n" // q4: d2, q5: d3 - "vld1.32 {d12-d15}, [%[ptr1]]! \n" // q6: d4, q7: d5 - "vld1.32 {d16-d19}, [%[ptr1]]! 
\n" // q8: d6, q9: d7 - "vtrn.32 q2, q3 \n" - "vtrn.32 q4, q5 \n" - "vtrn.32 q6, q7 \n" - "vtrn.32 q8, q9 \n" - "vswp.32 d5, d8 \n" - "vswp.32 d7, d10 \n" - "vswp.32 d13, d16 \n" - "vswp.32 d15, d18 \n" - - "vsub.f32 q10, q2, q8 \n" // d0 - d6 - "vsub.f32 q11, q6, q4 \n" // d4 - d2 - "vmla.f32 q10, q11, d0[0] \n" // d0 - d6 + (d4 - - // d2) * 5.25 - "vst1.32 {d20[0]}, [%[out0]], %[steps] \n" - "vst1.32 {d20[1]}, [%[out0]], %[steps] \n" - "vst1.32 {d21[0]}, [%[out0]], %[steps] \n" - "vst1.32 {d21[1]}, [%[out0]], %[steps] \n" - - "vadd.f32 q10, q4, q8 \n" - "vadd.f32 q11, q3, q7 \n" - "vmla.f32 q10, q6, d1[0] \n" // d2 - 4.25 * d4 + - // d6 - "vmla.f32 q11, q5, d1[0] \n" // d1 - 4.25 * d3 + - // d5 - "vadd.f32 q12, q10, q11 \n" - "vst1.32 {d24[0]}, [%[out1]], %[steps] \n" - "vst1.32 {d24[1]}, [%[out1]], %[steps] \n" - "vst1.32 {d25[0]}, [%[out1]], %[steps] \n" - "vst1.32 {d25[1]}, [%[out1]], %[steps] \n" - "vsub.f32 q12, q10, q11 \n" - "vst1.32 {d24[0]}, [%[out2]], %[steps] \n" - "vst1.32 {d24[1]}, [%[out2]], %[steps] \n" - "vst1.32 {d25[0]}, [%[out2]], %[steps] \n" - "vst1.32 {d25[1]}, [%[out2]], %[steps] \n" - - "vmul.f32 q10, q4, d3[1] \n" // 0.25 * d2 - "vmul.f32 q11, q3, d3[0] \n" // 0.5 * d1 - "vadd.f32 q10, q10, q8 \n" // 0.25 * d2 + d6 - "vmla.f32 q11, q7, d2[0] \n" // 0.5 * d1 + 2 * - // d5 - "vmla.f32 q10, q6, d2[1] \n" // 0.25 * d2 + d6 - // - 1.25 * d4 - "vmla.f32 q11, q5, d1[1] \n" // 0.5 * d1 + 2 * - // d5 - 2.5 * d3 - "vadd.f32 q12, q10, q11 \n" - "vst1.32 {d24[0]}, [%[out3]], %[steps] \n" - "vst1.32 {d24[1]}, [%[out3]], %[steps] \n" - "vst1.32 {d25[0]}, [%[out3]], %[steps] \n" - "vst1.32 {d25[1]}, [%[out3]], %[steps] \n" - "vsub.f32 q12, q10, q11 \n" - "vst1.32 {d24[0]}, [%[out4]], %[steps] \n" - "vst1.32 {d24[1]}, [%[out4]], %[steps] \n" - "vst1.32 {d25[0]}, [%[out4]], %[steps] \n" - "vst1.32 {d25[1]}, [%[out4]], %[steps] \n" - - "vmul.f32 q10, q4, d2[0] \n" // 2 * d2 - "vmul.f32 q11, q3, d2[0] \n" // 2 * d1 - "vmla.f32 q10, q6, d1[1] \n" // 2 * d2 - 
2.5 * - // d4 - "vmla.f32 q11, q5, d1[1] \n" // 2 * d1 - 2.5 * - // d3 - "vmla.f32 q10, q8, d3[0] \n" // 2 * d1 - 2.5 * - // d3 + 0.5 * d6 - "vmla.f32 q11, q7, d3[0] \n" // 2 * d2 - 2.5 * - // d4 + 0.5 * d5 - "vmul.f32 q10, q10, d2[0] \n" // 4 * d1 - 5 * d3 - // + d6 - "vadd.f32 q12, q10, q11 \n" - "vst1.32 {d24[0]}, [%[out5]], %[steps] \n" - "vst1.32 {d24[1]}, [%[out5]], %[steps] \n" - "vst1.32 {d25[0]}, [%[out5]], %[steps] \n" - "vst1.32 {d25[1]}, [%[out5]], %[steps] \n" - "vsub.f32 q12, q10, q11 \n" - "vst1.32 {d24[0]}, [%[out6]], %[steps] \n" - "vst1.32 {d24[1]}, [%[out6]], %[steps] \n" - "vst1.32 {d25[0]}, [%[out6]], %[steps] \n" - "vst1.32 {d25[1]}, [%[out6]], %[steps] \n" - - "vsub.f32 q10, q9, q3 \n" - "vsub.f32 q11, q5, q7 \n" - "vmla.f32 q10, q11, d0[0] \n" - "vst1.32 {d20[0]}, [%[out7]], %[steps] \n" - "vst1.32 {d20[1]}, [%[out7]], %[steps] \n" - "vst1.32 {d21[0]}, [%[out7]], %[steps] \n" - "vst1.32 {d21[1]}, [%[out7]], %[steps] \n" - - "subs r0, #1 \n" - "bne loop_r_%= \n" - : [out0] "+r"(out0), [out1] "+r"(out1), [out2] "+r"(out2), - [out3] "+r"(out3), [out4] "+r"(out4), [out5] "+r"(out5), - [out6] "+r"(out6), [out7] "+r"(out7), [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1) - : [tm_ptr] "r"((float *)transform_matrix), [steps] "r"(steps) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "r0"); -#endif // __aarch64__ - } - } - } -} - -template <> -void winograd_transform_output<8, 3>(const framework::Tensor &input, - const framework::Tensor &weight, - framework::Tensor *output) { - // weight shape is [out_channel/4, 64, in_channel, 4], - // input shape is [hw/8, 64, in_channel, 8] - int tiles = input.dims()[0]; - int in_channel = input.dims()[2]; - int out_channel = weight.dims()[0]; - - // compute U*V first - framework::Tensor uv_trans; - framework::DDim shape = - framework::make_ddim(std::vector{out_channel, tiles, 64, 32}); - float *uv_trans_ptr = uv_trans.mutable_data(shape); - const float 
*input_ptr = input.data(); - const float *weight_ptr = weight.data(); - - #pragma omp parallel for - for (int i = 0; i < out_channel; ++i) { - float *uv_ptr = uv_trans_ptr + (i * tiles * 64 * 32); - for (int j = 0; j < tiles; ++j) { - for (int k = 0; k < 64; ++k) { - const float *w_ptr = weight_ptr + (i * 64 + k) * in_channel * 4; - const float *in_ptr = input_ptr + (j * 64 + k) * in_channel * 8; - int inter_channel = in_channel >> 1; - int remain_channel = in_channel & 0x1; -#if __aarch64__ - asm volatile( - "dup v8.4s, wzr \n" - "dup v9.4s, wzr \n" - "dup v10.4s, wzr \n" - "dup v11.4s, wzr \n" - "dup v12.4s, wzr \n" - "dup v13.4s, wzr \n" - "dup v14.4s, wzr \n" - "dup v15.4s, wzr \n" - - "cmp %[inter], #0 \n" - "ble 2f \n" - // loop 2 channels - "1: \n" - "ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n" - "ld1 {v4.4s, v5.4s}, [%[in_ptr]], #32 \n" - - "fmla v8.4s, v2.4s, v0.s[0] \n" - "fmla v9.4s, v3.4s, v0.s[0] \n" - "fmla v10.4s, v2.4s, v0.s[1] \n" - "fmla v11.4s, v3.4s, v0.s[1] \n" - "fmla v12.4s, v2.4s, v0.s[2] \n" - "fmla v13.4s, v3.4s, v0.s[2] \n" - "fmla v14.4s, v2.4s, v0.s[3] \n" - "fmla v15.4s, v3.4s, v0.s[3] \n" - - "fmla v8.4s, v4.4s, v1.s[0] \n" - "fmla v9.4s, v5.4s, v1.s[0] \n" - "fmla v10.4s, v4.4s, v1.s[1] \n" - "fmla v11.4s, v5.4s, v1.s[1] \n" - "fmla v12.4s, v4.4s, v1.s[2] \n" - "fmla v13.4s, v5.4s, v1.s[2] \n" - "fmla v14.4s, v4.4s, v1.s[3] \n" - "fmla v15.4s, v5.4s, v1.s[3] \n" - - "subs %[inter], %[inter], #1 \n" - "bne 1b \n" - - // loop 1 channel - "2: \n" - "cmp %[remain], #0 \n" - "ble 3f \n" - - "ld1 {v0.4s, v1.4s}, [%[w_ptr]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[in_ptr]], #32 \n" - "fmla v8.4s, v2.4s, v0.s[0] \n" - "fmla v9.4s, v3.4s, v0.s[0] \n" - "fmla v10.4s, v2.4s, v0.s[1] \n" - "fmla v11.4s, v3.4s, v0.s[1] \n" - "fmla v12.4s, v2.4s, v0.s[2] \n" - "fmla v13.4s, v3.4s, v0.s[2] \n" - "fmla v14.4s, v2.4s, v0.s[3] \n" - "fmla v15.4s, v3.4s, v0.s[3] \n" - - "3: \n" - "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, 
[%[uv_ptr]], #64 \n" - "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[uv_ptr]], #64 \n" - : [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr), - [inter] "+r"(inter_channel) - : [remain] "r"(remain_channel) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); -#else - asm volatile( - "veor q8, q8, q8 \n" - "veor q9, q9, q9 \n" - "veor q10, q10, q10 \n" - "veor q11, q11, q11 \n" - "veor q12, q12, q12 \n" - "veor q13, q13, q13 \n" - "veor q14, q14, q14 \n" - "veor q15, q15, q15 \n" - - "cmp %[inter_channel], #0 \n" - "ble loop_1c_%= \n" - // loop 2 channels - "loop_2c_%=: \n" - "vld1.32 {d0-d3}, [%[w_ptr]]! \n" - "vld1.32 {d4-d7}, [%[in_ptr]]! \n" - "vld1.32 {d8-d11}, [%[in_ptr]]! \n" - "vmla.f32 q8, q2, d0[0] \n" - "vmla.f32 q9, q3, d0[0] \n" - "vmla.f32 q10, q2, d0[1] \n" - "vmla.f32 q11, q3, d0[1] \n" - "vmla.f32 q12, q2, d1[0] \n" - "vmla.f32 q13, q3, d1[0] \n" - "vmla.f32 q14, q2, d1[1] \n" - "vmla.f32 q15, q3, d1[1] \n" - - "vmla.f32 q8, q4, d2[0] \n" - "vmla.f32 q9, q5, d2[0] \n" - "vmla.f32 q10, q4, d2[1] \n" - "vmla.f32 q11, q5, d2[1] \n" - "vmla.f32 q12, q4, d3[0] \n" - "vmla.f32 q13, q5, d3[0] \n" - "vmla.f32 q14, q4, d3[1] \n" - "vmla.f32 q15, q5, d3[1] \n" - - "subs %[inter_channel], #1 \n" - "bne loop_2c_%= \n" - - // loop 1 channel - "loop_1c_%=: \n" - "cmp %[remain_channel], #0 \n" - "ble store_res_%= \n" - - "vld1.32 {d0-d1}, [%[w_ptr]]! \n" - "vld1.32 {d4-d7}, [%[in_ptr]]! \n" - "vmla.f32 q8, q2, d0[0] \n" - "vmla.f32 q9, q3, d0[0] \n" - "vmla.f32 q10, q2, d0[1] \n" - "vmla.f32 q11, q3, d0[1] \n" - "vmla.f32 q12, q2, d1[0] \n" - "vmla.f32 q13, q3, d1[0] \n" - "vmla.f32 q14, q2, d1[1] \n" - "vmla.f32 q15, q3, d1[1] \n" - - "store_res_%=: \n" - "vst1.32 {d16-d19}, [%[uv_ptr]]! \n" - "vst1.32 {d20-d23}, [%[uv_ptr]]! \n" - "vst1.32 {d24-d27}, [%[uv_ptr]]! \n" - "vst1.32 {d28-d31}, [%[uv_ptr]]! 
\n" - : [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr), - [inter_channel] "+r"(inter_channel) - : [remain_channel] "r"(remain_channel) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -#endif // __aarch64__ - } - } - } - - /* - * s0 = m0 + (m1 + m2) + (m3 + m4) + 32 * (m5 + m6) - * s1 = (m1 - m2) + 2 * (m3 - m4) + 16 * (m5 - m6) - * s2 = (m1 + m2) + 4 * (m3 + m4) + 8 * (m5 + m6) - * s3 = (m1 - m2) + 8 * (m3 - m4) + 4 * (m5 - m6) - * s4 = (m1 + m2) + 16 * (m3 + m4) + 2 * (m5 + m6) - * s5 = (m1 - m2) + 32 * (m3 - m4) + (m5 - m6) + m7 - */ - int out_h = output->dims()[2]; - int out_w = output->dims()[3]; - int h_tiles = (out_h + 5) / 6; - int w_tiles = (out_w + 5) / 6; - int remain_h = out_h - out_h / 6 * 6; - int remain_w = out_w - out_w / 6 * 6; - float *output_ptr = output->mutable_data(); - float transform_matrix[8] = {2.f, 4.f, 8.f, 16.f}; - - #pragma omp parallel for - for (int oc = 0; oc < output->dims()[1]; ++oc) { - float at_m[48]; // [6][8] - float output_tmp[36]; // [6][6], temporarily restore results - // (oc / 4) * tiles * 64 * 32 + (oc & 0x3) * 8 - const float *uv_ptr = - uv_trans_ptr + (oc >> 2) * tiles * 64 * 32 + (oc & 0x3) * 8; - for (int tile_h = 0; tile_h < h_tiles; ++tile_h) { - for (int tile_w = 0; tile_w < w_tiles; ++tile_w) { - float *at_m_ptr = at_m; - int tile_indics = tile_h * w_tiles + tile_w; - int tile_block = tile_indics >> 3; - int block_indics = tile_indics & 0x7; - const float *uv_ptr0 = uv_ptr + tile_block * 64 * 32 + block_indics; -#if __aarch64__ - float32x4_t _q0 = vld1q_f32(transform_matrix); - for (int l = 0; l < 2; ++l) { - float32x4_t _q1, _q2, _q3, _q4, _q5, _q6, _q7, _q8; - _q1 = vsetq_lane_f32(*uv_ptr0, _q1, 0); - uv_ptr0 += 32; - _q3 = vsetq_lane_f32(*uv_ptr0, _q3, 0); - uv_ptr0 += 32; - _q5 = vsetq_lane_f32(*uv_ptr0, _q5, 0); - uv_ptr0 += 32; - _q7 = vsetq_lane_f32(*uv_ptr0, _q7, 0); - uv_ptr0 += 32; - _q2 = 
vsetq_lane_f32(*uv_ptr0, _q2, 0); - uv_ptr0 += 32; - _q4 = vsetq_lane_f32(*uv_ptr0, _q4, 0); - uv_ptr0 += 32; - _q6 = vsetq_lane_f32(*uv_ptr0, _q6, 0); - uv_ptr0 += 32; - _q8 = vsetq_lane_f32(*uv_ptr0, _q8, 0); - uv_ptr0 += 32; - - _q1 = vsetq_lane_f32(*uv_ptr0, _q1, 1); - uv_ptr0 += 32; - _q3 = vsetq_lane_f32(*uv_ptr0, _q3, 1); - uv_ptr0 += 32; - _q5 = vsetq_lane_f32(*uv_ptr0, _q5, 1); - uv_ptr0 += 32; - _q7 = vsetq_lane_f32(*uv_ptr0, _q7, 1); - uv_ptr0 += 32; - _q2 = vsetq_lane_f32(*uv_ptr0, _q2, 1); - uv_ptr0 += 32; - _q4 = vsetq_lane_f32(*uv_ptr0, _q4, 1); - uv_ptr0 += 32; - _q6 = vsetq_lane_f32(*uv_ptr0, _q6, 1); - uv_ptr0 += 32; - _q8 = vsetq_lane_f32(*uv_ptr0, _q8, 1); - uv_ptr0 += 32; - - _q1 = vsetq_lane_f32(*uv_ptr0, _q1, 2); - uv_ptr0 += 32; - _q3 = vsetq_lane_f32(*uv_ptr0, _q3, 2); - uv_ptr0 += 32; - _q5 = vsetq_lane_f32(*uv_ptr0, _q5, 2); - uv_ptr0 += 32; - _q7 = vsetq_lane_f32(*uv_ptr0, _q7, 2); - uv_ptr0 += 32; - _q2 = vsetq_lane_f32(*uv_ptr0, _q2, 2); - uv_ptr0 += 32; - _q4 = vsetq_lane_f32(*uv_ptr0, _q4, 2); - uv_ptr0 += 32; - _q6 = vsetq_lane_f32(*uv_ptr0, _q6, 2); - uv_ptr0 += 32; - _q8 = vsetq_lane_f32(*uv_ptr0, _q8, 2); - uv_ptr0 += 32; - - _q1 = vsetq_lane_f32(*uv_ptr0, _q1, 3); - uv_ptr0 += 32; - _q3 = vsetq_lane_f32(*uv_ptr0, _q3, 3); - uv_ptr0 += 32; - _q5 = vsetq_lane_f32(*uv_ptr0, _q5, 3); - uv_ptr0 += 32; - _q7 = vsetq_lane_f32(*uv_ptr0, _q7, 3); - uv_ptr0 += 32; - _q2 = vsetq_lane_f32(*uv_ptr0, _q2, 3); - uv_ptr0 += 32; - _q4 = vsetq_lane_f32(*uv_ptr0, _q4, 3); - uv_ptr0 += 32; - _q6 = vsetq_lane_f32(*uv_ptr0, _q6, 3); - uv_ptr0 += 32; - _q8 = vsetq_lane_f32(*uv_ptr0, _q8, 3); - uv_ptr0 += 32; - - float32x4_t _q9 = vaddq_f32(_q3, _q5); - float32x4_t _q10 = vaddq_f32(_q7, _q2); - float32x4_t _q11 = vaddq_f32(_q4, _q6); - float32x4_t _q12 = vsubq_f32(_q3, _q5); - float32x4_t _q13 = vsubq_f32(_q7, _q2); - float32x4_t _q14 = vsubq_f32(_q4, _q6); - _q2 = vmulq_lane_f32(_q13, vget_low_f32(_q0), 0); - _q3 = vmulq_lane_f32(_q11, 
vget_low_f32(_q0), 0); - - float32x4_t _q15 = vaddq_f32(_q1, _q9); - _q15 = vaddq_f32(_q15, _q10); - _q15 = vmlaq_lane_f32(_q15, _q3, vget_high_f32(_q0), 1); - vst1q_f32(at_m_ptr, _q15); - - _q15 = vaddq_f32(_q12, _q2); - _q15 = vmlaq_lane_f32(_q15, _q14, vget_high_f32(_q0), 1); - vst1q_f32(at_m_ptr + 4, _q15); - - _q15 = vmlaq_lane_f32(_q9, _q10, vget_low_f32(_q0), 1); - _q15 = vmlaq_lane_f32(_q15, _q11, vget_high_f32(_q0), 0); - vst1q_f32(at_m_ptr + 8, _q15); - - _q15 = vmlaq_lane_f32(_q12, _q13, vget_high_f32(_q0), 0); - _q15 = vmlaq_lane_f32(_q15, _q14, vget_low_f32(_q0), 1); - vst1q_f32(at_m_ptr + 12, _q15); - - _q15 = vaddq_f32(_q9, _q3); - _q15 = vmlaq_lane_f32(_q15, _q10, vget_high_f32(_q0), 1); - vst1q_f32(at_m_ptr + 16, _q15); - - _q15 = vaddq_f32(_q12, _q8); - _q15 = vaddq_f32(_q15, _q14); - _q15 = vmlaq_lane_f32(_q15, _q2, vget_high_f32(_q0), 1); - vst1q_f32(at_m_ptr + 20, _q15); - - at_m_ptr += 24; - } -#else - int steps = 32 * sizeof(float); - asm volatile( - "vld1.32 {d0-d1}, [%[tm_ptr]] \n" - "mov r0, #2 \n" - - "loop_%=: \n" - "vld1.32 {d2[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d6[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d10[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d14[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d4[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d8[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d12[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d16[0]}, [%[uv_ptr0]], %[steps] \n" - - "vld1.32 {d2[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d6[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d10[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d14[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d4[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d8[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d12[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d16[1]}, [%[uv_ptr0]], %[steps] \n" - - "vld1.32 {d3[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d7[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d11[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d15[0]}, [%[uv_ptr0]], 
%[steps] \n" - "vld1.32 {d5[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d9[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d13[0]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d17[0]}, [%[uv_ptr0]], %[steps] \n" - - "vld1.32 {d3[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d7[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d11[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d15[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d5[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d9[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d13[1]}, [%[uv_ptr0]], %[steps] \n" - "vld1.32 {d17[1]}, [%[uv_ptr0]], %[steps] \n" - - "vadd.f32 q9, q3, q5 \n" // m1 + m2 - "vadd.f32 q10, q7, q2 \n" // m3 + m4 - "vadd.f32 q11, q4, q6 \n" // m5 + m6 - "vsub.f32 q12, q3, q5 \n" // m1 - m2 - "vsub.f32 q13, q7, q2 \n" // m3 - m4 - "vsub.f32 q14, q4, q6 \n" // m5 - m6 - "vmul.f32 q2, q13, d0[0] \n" // 2 * (m3 - m4) - "vmul.f32 q3, q11, d0[0] \n" // 2 * (m5 + m6) - - "vadd.f32 q15, q1, q9 \n" - "vadd.f32 q15, q15, q10 \n" - "vmla.f32 q15, q3, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vadd.f32 q15, q12, q2 \n" - "vmla.f32 q15, q14, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vmov.32 q15, q9 \n" - "vmla.f32 q15, q10, d0[1] \n" - "vmla.f32 q15, q11, d1[0] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vmov.32 q15, q12 \n" - "vmla.f32 q15, q13, d1[0] \n" - "vmla.f32 q15, q14, d0[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vadd.f32 q15, q9, q3 \n" - "vmla.f32 q15, q10, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vadd.f32 q15, q12, q8 \n" - "vadd.f32 q15, q15, q14 \n" - "vmla.f32 q15, q2, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! 
\n" - - "subs r0, #1 \n" - "bne loop_%= \n" - : [uv_ptr0] "+r"(uv_ptr0), [at_m_ptr] "+r"(at_m_ptr) - : [tm_ptr] "r"((float *)transform_matrix), [steps] "r"(steps) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); -#endif // __aarch64__ - - float *at_m_ptr0 = at_m; - float *at_m_ptr1 = at_m + 24; - if ((remain_w > 0 && tile_w == w_tiles - 1) || - (remain_h > 0 && tile_h == h_tiles - 1)) { - float *out_ptr0 = output_tmp; - float *out_ptr1 = output_tmp + 6; - float *out_ptr2 = output_tmp + 12; - float *out_ptr3 = output_tmp + 18; - float *out_ptr4 = output_tmp + 24; - float *out_ptr5 = output_tmp + 30; -#if __aarch64__ - float32x4_t _q0 = vld1q_f32(transform_matrix); - float32x4x2_t _q23, _q45, _q67, _q89; - _q23.val[0] = vld1q_f32(at_m_ptr0); - _q23.val[1] = vld1q_f32(at_m_ptr0 + 4); - _q45.val[0] = vld1q_f32(at_m_ptr0 + 8); - _q45.val[1] = vld1q_f32(at_m_ptr0 + 12); - _q67.val[0] = vld1q_f32(at_m_ptr1); - _q67.val[1] = vld1q_f32(at_m_ptr1 + 4); - _q89.val[0] = vld1q_f32(at_m_ptr1 + 8); - _q89.val[1] = vld1q_f32(at_m_ptr1 + 12); - _q23 = vtrnq_f32(_q23.val[0], _q23.val[1]); - _q45 = vtrnq_f32(_q45.val[0], _q45.val[1]); - _q67 = vtrnq_f32(_q67.val[0], _q67.val[1]); - _q89 = vtrnq_f32(_q89.val[0], _q89.val[1]); - float32x4_t _q1 = vcombine_f32(vget_low_f32(_q23.val[0]), - vget_low_f32(_q45.val[0])); - float32x4_t _q3 = vcombine_f32(vget_high_f32(_q23.val[0]), - vget_high_f32(_q45.val[0])); - float32x4_t _q2 = vcombine_f32(vget_low_f32(_q23.val[1]), - vget_low_f32(_q45.val[1])); - float32x4_t _q4 = vcombine_f32(vget_high_f32(_q23.val[1]), - vget_high_f32(_q45.val[1])); - float32x4_t _q5 = vcombine_f32(vget_low_f32(_q67.val[0]), - vget_low_f32(_q89.val[0])); - float32x4_t _q7 = vcombine_f32(vget_high_f32(_q67.val[0]), - vget_high_f32(_q89.val[0])); - float32x4_t _q6 = vcombine_f32(vget_low_f32(_q67.val[1]), - vget_low_f32(_q89.val[1])); - float32x4_t _q8 = 
vcombine_f32(vget_high_f32(_q67.val[1]), - vget_high_f32(_q89.val[1])); - - float32x4_t _q9 = vaddq_f32(_q2, _q3); - float32x4_t _q10 = vaddq_f32(_q4, _q5); - float32x4_t _q11 = vaddq_f32(_q6, _q7); - float32x4_t _q12 = vsubq_f32(_q2, _q3); - float32x4_t _q13 = vsubq_f32(_q4, _q5); - float32x4_t _q14 = vsubq_f32(_q6, _q7); - _q6 = vmulq_lane_f32(_q13, vget_low_f32(_q0), 0); - _q7 = vmulq_lane_f32(_q11, vget_low_f32(_q0), 0); - - _q1 = vaddq_f32(_q1, _q9); - _q1 = vaddq_f32(_q1, _q10); - _q1 = vmlaq_lane_f32(_q1, _q7, vget_high_f32(_q0), 1); - - _q2 = vaddq_f32(_q12, _q6); - _q2 = vmlaq_lane_f32(_q2, _q14, vget_high_f32(_q0), 1); - - _q3 = vmlaq_lane_f32(_q9, _q10, vget_low_f32(_q0), 1); - _q3 = vmlaq_lane_f32(_q3, _q11, vget_high_f32(_q0), 0); - - _q4 = vmlaq_lane_f32(_q12, _q13, vget_high_f32(_q0), 0); - _q4 = vmlaq_lane_f32(_q4, _q14, vget_low_f32(_q0), 1); - - _q23 = vtrnq_f32(_q1, _q2); - _q45 = vtrnq_f32(_q3, _q4); - vst1_f32(out_ptr0, vget_low_f32(_q23.val[0])); - vst1_f32(out_ptr0 + 2, vget_low_f32(_q45.val[0])); - vst1_f32(out_ptr1, vget_low_f32(_q23.val[1])); - vst1_f32(out_ptr1 + 2, vget_low_f32(_q45.val[1])); - vst1_f32(out_ptr2, vget_high_f32(_q23.val[0])); - vst1_f32(out_ptr2 + 2, vget_high_f32(_q45.val[0])); - vst1_f32(out_ptr3, vget_high_f32(_q23.val[1])); - vst1_f32(out_ptr3 + 2, vget_high_f32(_q45.val[1])); - - _q1 = vaddq_f32(_q9, _q7); - _q1 = vmlaq_lane_f32(_q1, _q10, vget_high_f32(_q0), 1); - _q2 = vaddq_f32(_q12, _q8); - _q2 = vaddq_f32(_q2, _q14); - _q2 = vmlaq_lane_f32(_q2, _q6, vget_high_f32(_q0), 1); - _q23 = vtrnq_f32(_q1, _q2); - vst1_f32(out_ptr0 + 4, vget_low_f32(_q23.val[0])); - vst1_f32(out_ptr1 + 4, vget_low_f32(_q23.val[1])); - vst1_f32(out_ptr2 + 4, vget_high_f32(_q23.val[0])); - vst1_f32(out_ptr3 + 4, vget_high_f32(_q23.val[1])); - - // remain 2 rows - _q1 = vld1q_f32(at_m_ptr0 + 16); - _q2 = vld1q_f32(at_m_ptr0 + 20); - _q3 = vld1q_f32(at_m_ptr1 + 16); - _q4 = vld1q_f32(at_m_ptr1 + 20); - _q23 = vtrnq_f32(_q1, _q2); - _q45 = 
vtrnq_f32(_q3, _q4); - - float32x2_t _d2 = vget_low_f32(_q23.val[0]); - float32x2_t _d3 = vget_high_f32(_q23.val[0]); - float32x2_t _d4 = vget_low_f32(_q23.val[1]); - float32x2_t _d5 = vget_high_f32(_q23.val[1]); - float32x2_t _d6 = vget_low_f32(_q45.val[0]); - float32x2_t _d7 = vget_high_f32(_q45.val[0]); - float32x2_t _d8 = vget_low_f32(_q45.val[1]); - float32x2_t _d9 = vget_high_f32(_q45.val[1]); - - float32x2_t _d10 = vadd_f32(_d4, _d3); - float32x2_t _d11 = vadd_f32(_d5, _d6); - float32x2_t _d12 = vadd_f32(_d8, _d7); - float32x2_t _d13 = vsub_f32(_d4, _d3); - float32x2_t _d14 = vsub_f32(_d5, _d6); - float32x2_t _d15 = vsub_f32(_d8, _d7); - float32x2_t _d16 = vmul_lane_f32(_d14, vget_low_f32(_q0), 0); - float32x2_t _d17 = vmul_lane_f32(_d12, vget_low_f32(_q0), 0); - - float32x2_t _d18 = vadd_f32(_d2, _d10); - float32x2_t _d20 = vadd_f32(_d13, _d16); - float32x2_t _d19 = vmla_lane_f32(_d10, _d11, vget_low_f32(_q0), 1); - float32x2_t _d21 = vmla_lane_f32(_d13, _d14, vget_high_f32(_q0), 0); - _d18 = vadd_f32(_d18, _d11); - _d18 = vmla_lane_f32(_d18, _d17, vget_high_f32(_q0), 1); - _d20 = vmla_lane_f32(_d20, _d15, vget_high_f32(_q0), 1); - _d19 = vmla_lane_f32(_d19, _d12, vget_high_f32(_q0), 0); - _d21 = vmla_lane_f32(_d21, _d15, vget_low_f32(_q0), 1); - - float32x2x2_t _d18d20 = vtrn_f32(_d18, _d20); - float32x2x2_t _d19d21 = vtrn_f32(_d19, _d21); - vst1_f32(out_ptr4, _d18d20.val[0]); - vst1_f32(out_ptr4 + 2, _d19d21.val[0]); - vst1_f32(out_ptr5, _d18d20.val[1]); - vst1_f32(out_ptr5 + 2, _d19d21.val[1]); - - _d18 = vadd_f32(_d10, _d17); - _d18 = vmla_lane_f32(_d18, _d11, vget_high_f32(_q0), 1); - _d20 = vadd_f32(_d13, _d9); - _d20 = vadd_f32(_d20, _d15); - _d20 = vmla_lane_f32(_d20, _d16, vget_high_f32(_q0), 1); - _d18d20 = vtrn_f32(_d18, _d20); - vst1_f32(out_ptr4 + 4, _d18d20.val[0]); - vst1_f32(out_ptr5 + 4, _d18d20.val[1]); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[tm_ptr]] \n" - // process 4 rows - "vld1.32 {d2-d5}, [%[at_m_ptr0]]! 
\n" // q1: m0, q2: m1 - "vld1.32 {d6-d9}, [%[at_m_ptr0]]! \n" // q3: m2, q4: m3 - "vld1.32 {d10-d13}, [%[at_m_ptr1]]! \n" // q5: m4, q6: m5 - "vld1.32 {d14-d17}, [%[at_m_ptr1]]! \n" // q7: m6, q8: m7 - "vtrn.32 q1, q2 \n" - "vtrn.32 q3, q4 \n" - "vtrn.32 q5, q6 \n" - "vtrn.32 q7, q8 \n" - "vswp.32 d3, d6 \n" - "vswp.32 d5, d8 \n" - "vswp.32 d11, d14 \n" - "vswp.32 d13, d16 \n" - - "vadd.f32 q9, q2, q3 \n" // m1 + m2 - "vadd.f32 q10, q4, q5 \n" // m3 + m4 - "vadd.f32 q11, q6, q7 \n" // m5 + m6 - "vsub.f32 q12, q2, q3 \n" // m1 - m2 - "vsub.f32 q13, q4, q5 \n" // m3 - m4 - "vsub.f32 q14, q6, q7 \n" // m5 - m6 - "vmul.f32 q6, q13, d0[0] \n" // 2 * (m3 - m4) - "vmul.f32 q7, q11, d0[0] \n" // 2 * (m5 + m6) - - "vadd.f32 q1, q1, q9 \n" - "vadd.f32 q1, q1, q10 \n" - "vmla.f32 q1, q7, d1[1] \n" - - "vadd.f32 q2, q12, q6 \n" - "vmla.f32 q2, q14, d1[1] \n" - - "vmov.32 q3, q9 \n" - "vmla.f32 q3, q10, d0[1] \n" - "vmla.f32 q3, q11, d1[0] \n" - - "vmov.32 q4, q12 \n" - "vmla.f32 q4, q13, d1[0] \n" - "vmla.f32 q4, q14, d0[1] \n" - - "vtrn.32 q1, q2 \n" - "vtrn.32 q3, q4 \n" - "vswp.32 d3, d6 \n" - "vswp.32 d5, d8 \n" - "vst1.32 {d2-d3}, [%[out_ptr0]]! \n" - "vst1.32 {d4-d5}, [%[out_ptr1]]! \n" - "vst1.32 {d6-d7}, [%[out_ptr2]]! \n" - "vst1.32 {d8-d9}, [%[out_ptr3]]! \n" - - "vadd.f32 q1, q9, q7 \n" - "vmla.f32 q1, q10, d1[1] \n" - - "vadd.f32 q2, q12, q8 \n" - "vadd.f32 q2, q2, q14 \n" - "vmla.f32 q2, q6, d1[1] \n" - - "vtrn.32 q1, q2 \n" - "vst1.32 {d2}, [%[out_ptr0]]! \n" - "vst1.32 {d4}, [%[out_ptr1]]! \n" - "vst1.32 {d3}, [%[out_ptr2]]! \n" - "vst1.32 {d5}, [%[out_ptr3]]! \n" - - // remain 2 rows - "vld1.32 {d2-d5}, [%[at_m_ptr0]]! \n" // d2: m0, d3: m2, - // d4: m1, d5: m3 - "vld1.32 {d6-d9}, [%[at_m_ptr1]]! 
\n" // d6: m4, d7: m6, - // d8: m5, d9: m7 - "vtrn.32 q1, q2 \n" - "vtrn.32 q3, q4 \n" - - "vadd.f32 d10, d4, d3 \n" // m1 + m2 - "vadd.f32 d11, d5, d6 \n" // m3 + m4 - "vadd.f32 d12, d8, d7 \n" // m5 + m6 - "vsub.f32 d13, d4, d3 \n" // m1 - m2 - "vsub.f32 d14, d5, d6 \n" // m3 - m4 - "vsub.f32 d15, d8, d7 \n" // m5 - m6 - "vmul.f32 d16, d14, d0[0] \n" // 2 * (m3 - m4) - "vmul.f32 d17, d12, d0[0] \n" // 2 * (m5 + m6) - - "vadd.f32 d18, d2, d10 \n" - "vadd.f32 d18, d18, d11 \n" - "vmla.f32 d18, d17, d1[1] \n" - - "vadd.f32 d20, d13, d16 \n" - "vmla.f32 d20, d15, d1[1] \n" - - "vmov.32 d19, d10 \n" - "vmla.f32 d19, d11, d0[1] \n" - "vmla.f32 d19, d12, d1[0] \n" - - "vmov.32 d21, d13 \n" - "vmla.f32 d21, d14, d1[0] \n" - "vmla.f32 d21, d15, d0[1] \n" - - "vtrn.32 d18, d20 \n" - "vtrn.32 d19, d21 \n" - "vst1.32 {d18-d19}, [%[out_ptr4]]! \n" - "vst1.32 {d20-d21}, [%[out_ptr5]]! \n" - - "vadd.f32 d18, d10, d17 \n" - "vmla.f32 d18, d11, d1[1] \n" - - "vadd.f32 d19, d13, d9 \n" - "vadd.f32 d19, d19, d15 \n" - "vmla.f32 d19, d16, d1[1] \n" - - "vtrn.32 d18, d19 \n" - "vst1.32 {d18}, [%[out_ptr4]]! \n" - "vst1.32 {d19}, [%[out_ptr5]]! \n" - : [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [out_ptr3] "+r"(out_ptr3), - [out_ptr4] "+r"(out_ptr4), [out_ptr5] "+r"(out_ptr5), - [at_m_ptr0] "+r"(at_m_ptr0), [at_m_ptr1] "+r"(at_m_ptr1) - : [tm_ptr] "r"((float *)transform_matrix) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -#endif // __aarch64__ - size_t offset = (oc * out_h + 6 * tile_h) * out_w + 6 * tile_w; - float *out_ptr = output_ptr + offset; - int remain_row = out_h - 6 * tile_h; - int remain_col = out_w - 6 * tile_w; - remain_row = (remain_row > 6) ? 6 : remain_row; - remain_col = (remain_col > 6) ? 
6 : remain_col; - for (int i = 0; i < remain_row; ++i, out_ptr += out_w) { - memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float)); - } - } else { - size_t offset = (oc * out_h + 6 * tile_h) * out_w + 6 * tile_w; - float *out_ptr0 = output_ptr + offset; - float *out_ptr1 = out_ptr0 + out_w; - float *out_ptr2 = out_ptr1 + out_w; - float *out_ptr3 = out_ptr2 + out_w; - float *out_ptr4 = out_ptr3 + out_w; - float *out_ptr5 = out_ptr4 + out_w; -#if __aarch64__ - float32x4_t _q0 = vld1q_f32(transform_matrix); - float32x4x2_t _q23, _q45, _q67, _q89; - _q23.val[0] = vld1q_f32(at_m_ptr0); - _q23.val[1] = vld1q_f32(at_m_ptr0 + 4); - _q45.val[0] = vld1q_f32(at_m_ptr0 + 8); - _q45.val[1] = vld1q_f32(at_m_ptr0 + 12); - _q67.val[0] = vld1q_f32(at_m_ptr1); - _q67.val[1] = vld1q_f32(at_m_ptr1 + 4); - _q89.val[0] = vld1q_f32(at_m_ptr1 + 8); - _q89.val[1] = vld1q_f32(at_m_ptr1 + 12); - _q23 = vtrnq_f32(_q23.val[0], _q23.val[1]); - _q45 = vtrnq_f32(_q45.val[0], _q45.val[1]); - _q67 = vtrnq_f32(_q67.val[0], _q67.val[1]); - _q89 = vtrnq_f32(_q89.val[0], _q89.val[1]); - float32x4_t _q1 = vcombine_f32(vget_low_f32(_q23.val[0]), - vget_low_f32(_q45.val[0])); - float32x4_t _q3 = vcombine_f32(vget_high_f32(_q23.val[0]), - vget_high_f32(_q45.val[0])); - float32x4_t _q2 = vcombine_f32(vget_low_f32(_q23.val[1]), - vget_low_f32(_q45.val[1])); - float32x4_t _q4 = vcombine_f32(vget_high_f32(_q23.val[1]), - vget_high_f32(_q45.val[1])); - float32x4_t _q5 = vcombine_f32(vget_low_f32(_q67.val[0]), - vget_low_f32(_q89.val[0])); - float32x4_t _q7 = vcombine_f32(vget_high_f32(_q67.val[0]), - vget_high_f32(_q89.val[0])); - float32x4_t _q6 = vcombine_f32(vget_low_f32(_q67.val[1]), - vget_low_f32(_q89.val[1])); - float32x4_t _q8 = vcombine_f32(vget_high_f32(_q67.val[1]), - vget_high_f32(_q89.val[1])); - - float32x4_t _q9 = vaddq_f32(_q2, _q3); - float32x4_t _q10 = vaddq_f32(_q4, _q5); - float32x4_t _q11 = vaddq_f32(_q6, _q7); - float32x4_t _q12 = vsubq_f32(_q2, _q3); - float32x4_t _q13 = 
vsubq_f32(_q4, _q5); - float32x4_t _q14 = vsubq_f32(_q6, _q7); - _q6 = vmulq_lane_f32(_q13, vget_low_f32(_q0), 0); - _q7 = vmulq_lane_f32(_q11, vget_low_f32(_q0), 0); - - _q1 = vaddq_f32(_q1, _q9); - _q1 = vaddq_f32(_q1, _q10); - _q1 = vmlaq_lane_f32(_q1, _q7, vget_high_f32(_q0), 1); - _q2 = vaddq_f32(_q12, _q6); - _q2 = vmlaq_lane_f32(_q2, _q14, vget_high_f32(_q0), 1); - _q3 = vmlaq_lane_f32(_q9, _q10, vget_low_f32(_q0), 1); - _q3 = vmlaq_lane_f32(_q3, _q11, vget_high_f32(_q0), 0); - _q4 = vmlaq_lane_f32(_q12, _q13, vget_high_f32(_q0), 0); - _q4 = vmlaq_lane_f32(_q4, _q14, vget_low_f32(_q0), 1); - - _q23 = vtrnq_f32(_q1, _q2); - _q45 = vtrnq_f32(_q3, _q4); - vst1_f32(out_ptr0, vget_low_f32(_q23.val[0])); - vst1_f32(out_ptr0 + 2, vget_low_f32(_q45.val[0])); - vst1_f32(out_ptr1, vget_low_f32(_q23.val[1])); - vst1_f32(out_ptr1 + 2, vget_low_f32(_q45.val[1])); - vst1_f32(out_ptr2, vget_high_f32(_q23.val[0])); - vst1_f32(out_ptr2 + 2, vget_high_f32(_q45.val[0])); - vst1_f32(out_ptr3, vget_high_f32(_q23.val[1])); - vst1_f32(out_ptr3 + 2, vget_high_f32(_q45.val[1])); - - _q1 = vaddq_f32(_q9, _q7); - _q1 = vmlaq_lane_f32(_q1, _q10, vget_high_f32(_q0), 1); - _q2 = vaddq_f32(_q12, _q8); - _q2 = vaddq_f32(_q2, _q14); - _q2 = vmlaq_lane_f32(_q2, _q6, vget_high_f32(_q0), 1); - _q23 = vtrnq_f32(_q1, _q2); - vst1_f32(out_ptr0 + 4, vget_low_f32(_q23.val[0])); - vst1_f32(out_ptr1 + 4, vget_low_f32(_q23.val[1])); - vst1_f32(out_ptr2 + 4, vget_high_f32(_q23.val[0])); - vst1_f32(out_ptr3 + 4, vget_high_f32(_q23.val[1])); - - // remain 2 rows - _q1 = vld1q_f32(at_m_ptr0 + 16); - _q2 = vld1q_f32(at_m_ptr0 + 20); - _q3 = vld1q_f32(at_m_ptr1 + 16); - _q4 = vld1q_f32(at_m_ptr1 + 20); - _q23 = vtrnq_f32(_q1, _q2); - _q45 = vtrnq_f32(_q3, _q4); - - float32x2_t _d2 = vget_low_f32(_q23.val[0]); - float32x2_t _d3 = vget_high_f32(_q23.val[0]); - float32x2_t _d4 = vget_low_f32(_q23.val[1]); - float32x2_t _d5 = vget_high_f32(_q23.val[1]); - float32x2_t _d6 = vget_low_f32(_q45.val[0]); - 
float32x2_t _d7 = vget_high_f32(_q45.val[0]); - float32x2_t _d8 = vget_low_f32(_q45.val[1]); - float32x2_t _d9 = vget_high_f32(_q45.val[1]); - - float32x2_t _d10 = vadd_f32(_d4, _d3); - float32x2_t _d11 = vadd_f32(_d5, _d6); - float32x2_t _d12 = vadd_f32(_d8, _d7); - float32x2_t _d13 = vsub_f32(_d4, _d3); - float32x2_t _d14 = vsub_f32(_d5, _d6); - float32x2_t _d15 = vsub_f32(_d8, _d7); - float32x2_t _d16 = vmul_lane_f32(_d14, vget_low_f32(_q0), 0); - float32x2_t _d17 = vmul_lane_f32(_d12, vget_low_f32(_q0), 0); - - float32x2_t _d18 = vadd_f32(_d2, _d10); - float32x2_t _d20 = vadd_f32(_d13, _d16); - float32x2_t _d19 = vmla_lane_f32(_d10, _d11, vget_low_f32(_q0), 1); - float32x2_t _d21 = vmla_lane_f32(_d13, _d14, vget_high_f32(_q0), 0); - _d18 = vadd_f32(_d18, _d11); - _d18 = vmla_lane_f32(_d18, _d17, vget_high_f32(_q0), 1); - _d20 = vmla_lane_f32(_d20, _d15, vget_high_f32(_q0), 1); - _d19 = vmla_lane_f32(_d19, _d12, vget_high_f32(_q0), 0); - _d21 = vmla_lane_f32(_d21, _d15, vget_low_f32(_q0), 1); - - float32x2x2_t _d18d20 = vtrn_f32(_d18, _d20); - float32x2x2_t _d19d21 = vtrn_f32(_d19, _d21); - vst1_f32(out_ptr4, _d18d20.val[0]); - vst1_f32(out_ptr4 + 2, _d19d21.val[0]); - vst1_f32(out_ptr5, _d18d20.val[1]); - vst1_f32(out_ptr5 + 2, _d19d21.val[1]); - - _d18 = vadd_f32(_d10, _d17); - _d18 = vmla_lane_f32(_d18, _d11, vget_high_f32(_q0), 1); - _d20 = vadd_f32(_d13, _d9); - _d20 = vadd_f32(_d20, _d15); - _d20 = vmla_lane_f32(_d20, _d16, vget_high_f32(_q0), 1); - _d18d20 = vtrn_f32(_d18, _d20); - vst1_f32(out_ptr4 + 4, _d18d20.val[0]); - vst1_f32(out_ptr5 + 4, _d18d20.val[1]); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[tm_ptr]] \n" - // process 4 rows - "vld1.32 {d2-d5}, [%[at_m_ptr0]]! \n" // q1: m0, q2: m1 - "vld1.32 {d6-d9}, [%[at_m_ptr0]]! \n" // q3: m2, q4: m3 - "vld1.32 {d10-d13}, [%[at_m_ptr1]]! \n" // q5: m4, q6: m5 - "vld1.32 {d14-d17}, [%[at_m_ptr1]]! 
\n" // q7: m6, q8: m7 - "vtrn.32 q1, q2 \n" - "vtrn.32 q3, q4 \n" - "vtrn.32 q5, q6 \n" - "vtrn.32 q7, q8 \n" - "vswp.32 d3, d6 \n" - "vswp.32 d5, d8 \n" - "vswp.32 d11, d14 \n" - "vswp.32 d13, d16 \n" - - "vadd.f32 q9, q2, q3 \n" // m1 + m2 - "vadd.f32 q10, q4, q5 \n" // m3 + m4 - "vadd.f32 q11, q6, q7 \n" // m5 + m6 - "vsub.f32 q12, q2, q3 \n" // m1 - m2 - "vsub.f32 q13, q4, q5 \n" // m3 - m4 - "vsub.f32 q14, q6, q7 \n" // m5 - m6 - "vmul.f32 q6, q13, d0[0] \n" // 2 * (m3 - m4) - "vmul.f32 q7, q11, d0[0] \n" // 2 * (m5 + m6) - - "vadd.f32 q1, q1, q9 \n" - "vadd.f32 q1, q1, q10 \n" - "vmla.f32 q1, q7, d1[1] \n" - - "vadd.f32 q2, q12, q6 \n" - "vmla.f32 q2, q14, d1[1] \n" - - "vmov.32 q3, q9 \n" - "vmla.f32 q3, q10, d0[1] \n" - "vmla.f32 q3, q11, d1[0] \n" - - "vmov.32 q4, q12 \n" - "vmla.f32 q4, q13, d1[0] \n" - "vmla.f32 q4, q14, d0[1] \n" - - "vtrn.32 q1, q2 \n" - "vtrn.32 q3, q4 \n" - "vswp.32 d3, d6 \n" - "vswp.32 d5, d8 \n" - "vst1.32 {d2-d3}, [%[out_ptr0]]! \n" - "vst1.32 {d4-d5}, [%[out_ptr1]]! \n" - "vst1.32 {d6-d7}, [%[out_ptr2]]! \n" - "vst1.32 {d8-d9}, [%[out_ptr3]]! \n" - - "vadd.f32 q1, q9, q7 \n" - "vmla.f32 q1, q10, d1[1] \n" - - "vadd.f32 q2, q12, q8 \n" - "vadd.f32 q2, q2, q14 \n" - "vmla.f32 q2, q6, d1[1] \n" - - "vtrn.32 q1, q2 \n" - "vst1.32 {d2}, [%[out_ptr0]]! \n" - "vst1.32 {d4}, [%[out_ptr1]]! \n" - "vst1.32 {d3}, [%[out_ptr2]]! \n" - "vst1.32 {d5}, [%[out_ptr3]]! \n" - - // remain 2 rows - "vld1.32 {d2-d5}, [%[at_m_ptr0]]! \n" // d2: m0, d3: m2, - // d4: m1, d5: m3 - "vld1.32 {d6-d9}, [%[at_m_ptr1]]! 
\n" // d6: m4, d7: m6, - // d8: m5, d9: m7 - "vtrn.32 q1, q2 \n" - "vtrn.32 q3, q4 \n" - - "vadd.f32 d10, d4, d3 \n" // m1 + m2 - "vadd.f32 d11, d5, d6 \n" // m3 + m4 - "vadd.f32 d12, d8, d7 \n" // m5 + m6 - "vsub.f32 d13, d4, d3 \n" // m1 - m2 - "vsub.f32 d14, d5, d6 \n" // m3 - m4 - "vsub.f32 d15, d8, d7 \n" // m5 - m6 - "vmul.f32 d16, d14, d0[0] \n" // 2 * (m3 - m4) - "vmul.f32 d17, d12, d0[0] \n" // 2 * (m5 + m6) - - "vadd.f32 d18, d2, d10 \n" - "vadd.f32 d18, d18, d11 \n" - "vmla.f32 d18, d17, d1[1] \n" - - "vadd.f32 d20, d13, d16 \n" - "vmla.f32 d20, d15, d1[1] \n" - - "vmov.32 d19, d10 \n" - "vmla.f32 d19, d11, d0[1] \n" - "vmla.f32 d19, d12, d1[0] \n" - - "vmov.32 d21, d13 \n" - "vmla.f32 d21, d14, d1[0] \n" - "vmla.f32 d21, d15, d0[1] \n" - - "vtrn.32 d18, d20 \n" - "vtrn.32 d19, d21 \n" - "vst1.32 {d18-d19}, [%[out_ptr4]]! \n" - "vst1.32 {d20-d21}, [%[out_ptr5]]! \n" - - "vadd.f32 d18, d10, d17 \n" - "vmla.f32 d18, d11, d1[1] \n" - - "vadd.f32 d19, d13, d9 \n" - "vadd.f32 d19, d19, d15 \n" - "vmla.f32 d19, d16, d1[1] \n" - - "vtrn.32 d18, d19 \n" - "vst1.32 {d18}, [%[out_ptr4]]! \n" - "vst1.32 {d19}, [%[out_ptr5]]! \n" - : [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), - [out_ptr2] "+r"(out_ptr2), [out_ptr3] "+r"(out_ptr3), - [out_ptr4] "+r"(out_ptr4), [out_ptr5] "+r"(out_ptr5), - [at_m_ptr0] "+r"(at_m_ptr0), [at_m_ptr1] "+r"(at_m_ptr1) - : [tm_ptr] "r"((float *)transform_matrix) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -#endif // __aarch64__ - } - } - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif // CONV_OP -#endif // __ARM_NEON__ diff --git a/mobile/src/operators/mul_op.cpp b/mobile/src/operators/mul_op.cpp deleted file mode 100644 index b11f8f95f1..0000000000 --- a/mobile/src/operators/mul_op.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MUL_OP - -#include "mul_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void MulOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - auto y_dims = this->param_.InputY()->dims(); - int x_num_col_dims = this->param_.XNumColDims(); - int y_num_col_dims = this->param_.YNumColDims(); - - assert(x_dims.size() > x_num_col_dims); - assert(y_dims.size() > y_num_col_dims); - - /// (1,2,3,4) , x_num_col_dims = 2 -> (2,12) - auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); - auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); - - assert(x_mat_dims[1] == y_mat_dims[0]); - - std::vector output_dims; - output_dims.reserve( - static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); - - for (int i = 0; i < x_num_col_dims; ++i) { - output_dims.push_back(x_dims[i]); - } - - for (int i = y_num_col_dims; i < y_dims.size(); ++i) { - output_dims.push_back(y_dims[i]); - } - - framework::DDim ddim = framework::make_ddim(output_dims); - this->param_.Out()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(mul, ops::MulOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(mul, ops::MulOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(mul, ops::MulOp); -#endif -#endif diff --git a/mobile/src/operators/mul_op.h 
b/mobile/src/operators/mul_op.h deleted file mode 100644 index b08cdbf991..0000000000 --- a/mobile/src/operators/mul_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MUL_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/mul_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class MulOp : public framework::OperatorWithKernel< - DeviceType, MulParam, - operators::MulKernel> { - public: - MulOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::MulKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/multiclass_nms_op.cpp b/mobile/src/operators/multiclass_nms_op.cpp deleted file mode 100644 index 1dd7883c8b..0000000000 --- a/mobile/src/operators/multiclass_nms_op.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MULTICLASSNMS_OP - -#include "operators/multiclass_nms_op.h" -namespace paddle_mobile { -namespace operators { - -template -void MultiClassNMSOp::InferShape() const { - auto input_bboxes_dims = this->param_.InputBBoxes()->dims(); - auto input_scores_dims = this->param_.InputScores()->dims(); - if (input_scores_dims.size() != 3) { - LOG(kLOG_ERROR) << "Input Scores size must be 3"; - } - if (input_bboxes_dims[2] % 4 != 0 || input_bboxes_dims[2] < 4) { - LOG(kLOG_ERROR) << "Input BBoxes 2nd dimension must be multiples of 4"; - } - if (input_bboxes_dims[1] != input_scores_dims[2]) { - LOG(kLOG_ERROR) << "Predict bboxes must be equal"; - } - // pre size, will change in Compute. - this->param_.Out()->Resize( - framework::make_ddim({input_bboxes_dims[1], input_bboxes_dims[2] + 2})); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(multiclass_nms, ops::MultiClassNMSOp); -#endif - -#endif diff --git a/mobile/src/operators/multiclass_nms_op.h b/mobile/src/operators/multiclass_nms_op.h deleted file mode 100644 index bba701d81a..0000000000 --- a/mobile/src/operators/multiclass_nms_op.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MULTICLASSNMS_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/multiclass_nms_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class MultiClassNMSOp : public framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, - operators::MultiClassNMSKernel> { - public: - MultiClassNMSOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, - operators::MultiClassNMSKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/nearest_interp_op.cpp b/mobile/src/operators/nearest_interp_op.cpp deleted file mode 100644 index 8e6c9b86d6..0000000000 --- a/mobile/src/operators/nearest_interp_op.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef NEAREST_INTERP_OP - -#include "operators/nearest_interp_op.h" -#include -namespace paddle_mobile { -namespace operators { -template -void NearestInterpolationOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, - "Input(X) of BilinearInterOp should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, - "Output(Out) of BilinearInterOp should not be null."); - auto dim_x = this->param_.InputX()->dims(); // NCHW format - DLOG << "dim_x :" << dim_x; - - bool ignore_scale = false; - int out_h = this->param_.OutH(); - int out_w = this->param_.OutW(); - if (out_h > 0 && out_w > 0) { - ignore_scale = true; - } - PADDLE_MOBILE_ENFORCE(dim_x.size() == 4, "X's dimension must be 4"); - - if (this->param_.InputOutPutSize() != nullptr) { - auto out_size_dim = this->param_.InputOutPutSize()->dims(); - - PADDLE_MOBILE_ENFORCE(out_size_dim.size() == 1, - "OutSize's dimension size must be 1"); - PADDLE_MOBILE_ENFORCE(out_size_dim[0] == 2, "OutSize's dim[0] must be 2"); - } - - DLOG << "this->param_.HasScale(): " << this->param_.HasScale(); - if (this->param_.HasScale() && !ignore_scale) { - const float scale = this->param_.Scale(); - DLOG << "scale_: " << scale; - std::vector dim_out({dim_x[0], dim_x[1], - static_cast(dim_x[2] * scale), - static_cast(dim_x[3] * scale)}); - this->param_.Out()->Resize(framework::make_ddim(dim_out)); - DLOG << "interp -- dim_out: " << dim_out; - - } else { - std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w}); - this->param_.Out()->Resize(framework::make_ddim(dim_out)); - DLOG << "interp -- 
dim_out: " << dim_out; - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(nearest_interp, ops::NearestInterpolationOp); -#endif - -#if PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(nearest_interp, ops::NearestInterpolationOp) -#endif - -#endif diff --git a/mobile/src/operators/nearest_interp_op.h b/mobile/src/operators/nearest_interp_op.h deleted file mode 100644 index 130de53231..0000000000 --- a/mobile/src/operators/nearest_interp_op.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef NEAREST_INTERP_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/nearest_interp_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class NearestInterpolationOp - : public framework::OperatorWithKernel< - DeviceType, NearestInterpolationParam, - operators::NearestInterpolationKernel> { - public: - NearestInterpolationOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, NearestInterpolationParam, - operators::NearestInterpolationKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/norm_op.cpp b/mobile/src/operators/norm_op.cpp deleted file mode 100644 index 5541755eb0..0000000000 --- a/mobile/src/operators/norm_op.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef NORM_OP - -#include "operators/norm_op.h" -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" - -namespace paddle_mobile { -namespace operators { - -template -void NormOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - this->param_.Out()->Resize(x_dims); - - int axis = this->param_.Axis(); - if (axis < 0) { - axis += x_dims.size(); - } - x_dims[axis] = 1; - this->param_.OutputNorm()->Resize(x_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(norm, ops::NormOp); -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - -#ifdef PADDLE_MOBILE_CL -#endif - -#endif diff --git a/mobile/src/operators/norm_op.h b/mobile/src/operators/norm_op.h deleted file mode 100644 index 64d8e7c3cc..0000000000 --- a/mobile/src/operators/norm_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef NORM_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/norm_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class NormOp - : public framework::OperatorWithKernel, - NormKernel> { - public: - NormOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - NormKernel>( - type, inputs, outputs, attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/one_hot_op.cpp b/mobile/src/operators/one_hot_op.cpp deleted file mode 100644 index 64fcc64785..0000000000 --- a/mobile/src/operators/one_hot_op.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ONE_HOT_OP - -#include "operators/one_hot_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void OnehotOp::InferShape() const { - const auto &x_dims = this->param_.input_->dims(); - int depth = this->param_.depth_; - framework::DDim out_dims(x_dims); - out_dims[out_dims.size() - 1] = depth; - this->param_.output_->Resize(out_dims); - if (std::is_same, Dtype>::value) { - this->param_.output_->set_lod(this->param_.input_->lod()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(one_hot, ops::OnehotOp); -#endif - -#endif // ONE_HOT_OP diff --git a/mobile/src/operators/one_hot_op.h b/mobile/src/operators/one_hot_op.h deleted file mode 100644 index 4b7e83bf99..0000000000 --- a/mobile/src/operators/one_hot_op.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef ONE_HOT_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/one_hot_kernel.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(Onehot, OnehotParam, OnehotKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif // ONE_HOT_OP diff --git a/mobile/src/operators/op_param.cpp b/mobile/src/operators/op_param.cpp deleted file mode 100644 index bccff4a274..0000000000 --- a/mobile/src/operators/op_param.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/op_param.h" -namespace paddle_mobile { -namespace operators { - -#ifdef CONV_OP -template <> -Print &operator<<(Print &printer, const ConvParam &conv_param) { - printer << "parameter of conv: " - << "\n"; - printer << " stride: " - << " (" << conv_param.Strides()[0] << conv_param.Strides()[1] << ") " - << "\n"; - printer << " paddings: " - << " (" << conv_param.Paddings()[0] << conv_param.Paddings()[1] - << ") " - << "\n"; - printer << " dilations: " - << " (" << conv_param.Dilations()[0] << conv_param.Dilations()[1] - << ") " - << "\n"; - printer << " groups: " << conv_param.Groups() << "\n"; - printer << " input dims: " << conv_param.Input()->dims() << "\n"; - printer << " filter dims: " << conv_param.Filter()->dims() << "\n"; - printer << " output dims: " << conv_param.Output()->dims(); - return printer; -} - -template class ConvParam; -template class ConvParam; -#endif - -#ifdef ELEMENTWISEADD_OP -template class ElementwiseAddParam; -template class ElementwiseAddParam; -#endif - -#ifdef ELEMENTWISEMUL_OP -template class ElementwiseMulParam; -template class ElementwiseMulParam; -#endif - -#ifdef MUL_OP -template class MulParam; -template class MulParam; -#endif - -#ifdef CONCAT_OP -template class ConcatParam; -template class ConcatParam; -#endif - -#ifdef LRN_OP -template class LrnParam; -template class LrnParam; -#endif - -#ifdef FUSION_CONVADD_OP - -Print &operator<<(Print &printer, const FusionConvAddParam &conv_param) { - printer << "parameter of conv_add: " - << "\n"; - printer << " stride: " - << " (" << conv_param.Strides()[0] << conv_param.Strides()[1] << ") " - << "\n"; - printer << " paddings: " - << " (" << conv_param.Paddings()[0] << conv_param.Paddings()[1] - << ") " - << "\n"; - printer << " dilations: " - << " (" << conv_param.Dilations()[0] << conv_param.Dilations()[1] - << ") " - << "\n"; - printer << " groups: " << conv_param.Groups() << "\n"; - printer << " input dims: " << conv_param.Input()->dims() << "\n"; - 
printer << " filter dims: " << conv_param.Filter()->dims() << "\n"; - printer << " bias dims: " << conv_param.Bias()->dims() << "\n"; - printer << " output dims: " << conv_param.Output()->dims(); - return printer; -} - -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/op_param.h b/mobile/src/operators/op_param.h deleted file mode 100644 index 8ef339e82e..0000000000 --- a/mobile/src/operators/op_param.h +++ /dev/null @@ -1,3816 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include "common/log.h" -#include "common/type_define.h" -#include "common/types.h" -#include "framework/attribute.h" -#include "framework/lod_tensor.h" -#include "framework/scope.h" -#include "framework/tensor.h" -#include "framework/type_trait.h" -#include "framework/variable.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif - -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -#ifdef PADDLE_MOBILE_FPGA_KD -#include "fpga/KD/context.hpp" -#endif - -#ifdef PADDLE_MOBILE_CL -#include "framework/cl/cl_image.h" -#endif - -namespace paddle_mobile { -namespace operators { - -using framework::Attribute; -using framework::AttributeMap; -using framework::LoDTensor; -using framework::Scope; -using framework::Tensor; -using framework::Variable; -using std::string; -using std::vector; - -using framework::DtypeTensorTrait; - -template -class CLImageDeleter { - typedef typename DtypeTensorTrait::gtype GType; - - public: - void operator()(GType *ptr) { -#ifdef PADDLE_MOBILE_CL - framework::CLImage *image = dynamic_cast(ptr); - if (image) { - delete image; - } -#endif - } -}; - -class OpParam { - public: - OpParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : scope_(scope) {} - - Scope *GetScope() const { return scope_; } - Scope *scope_ = nullptr; - -#ifdef PADDLE_MOBILE_FPGA_KD - zynqmp::Context &context() { return context_; } - - zynqmp::Context context_; -#endif - - protected: - template - static T *InputH0From(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("H0", inputs, scope); - } - - template - static T *InputHiddenPrevFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("HiddenPrev", inputs, scope); - } - - template - static T *InputAlphaFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Alpha", inputs, scope); - } - - template - static T 
*InputFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Input", inputs, scope); - } - - template - static T *InputXFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("X", inputs, scope); - } - template - static T *InputOutSizeFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("OutSize", inputs, scope); - } - - template - static T *InputWFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("W", inputs, scope); - } - - template - static T *InputIdsFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Ids", inputs, scope); - } - - template - static T *InputEmissionFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("Emission", inputs, scope); - } - - template - static T *InputTransitionFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("Transition", inputs, scope); - } - template - static T *InputLabelFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Label", inputs, scope); - } - - template - static T *InputXFrom1(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue1("addX", inputs, scope); - } - - template - static T *InputYFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Y", inputs, scope); - } - - template - static T *InputYFrom1(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue1("Y", inputs, scope); - } - - template - static T *InputZFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Z", inputs, scope); - } - - template - static T *InputBiasFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Bias", inputs, scope); - } - template - static T *InputWeightFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Weight", inputs, scope); - } - template - static T *InputVarianceFrom(const 
VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("Variance", inputs, scope); - } - template - static T *InputMeanFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Mean", inputs, scope); - } - template - static T *InputScaleFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Scale", inputs, scope); - } - template - static T *InputImageFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Image", inputs, scope); - } - template - static T *InputPriorBoxFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("PriorBox", inputs, scope); - } - template - static T *InputPriorBoxVarFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("PriorBoxVar", inputs, scope); - } - // LoDTensor but now use Tensor - template - static T *InputTargetBoxFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetVarValue("TargetBox", inputs, scope); - } - - template - static T *InputBBoxesFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("BBoxes", inputs, scope); - } - - template - static T *InputScoresFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Scores", inputs, scope); - } - - template - static T *InputShapeFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Shape", inputs, scope); - } - - template - static vector InputMultiFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetMultiVarValue("X", inputs, scope); - } - - static vector InputMultiVarsFrom(const VariableNameMap &inputs, - const Scope &scope) { - return GetMultiVar("X", inputs, scope); - } - - template - static T *OutputBatchGateFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("BatchGate", outputs, scope); - } - - template - static T *OutputGateFrom(const VariableNameMap &outputs, const Scope &scope) { - return 
GetVarValue("Gate", outputs, scope); - } - - template - static T *OutputViterbiPathFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("ViterbiPath", outputs, scope); - } - template - static T *OutputBatchResetHiddenPrevFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("BatchResetHiddenPrev", outputs, scope); - } - - template - static T *OutputResetHiddenPrevFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("ResetHiddenPrev", outputs, scope); - } - - template - static T *OutputBatchHiddenFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("BatchHidden", outputs, scope); - } - - template - static T *OutputHiddenFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("Hidden", outputs, scope); - } - - template - static T *OutputFrom(const VariableNameMap &outputs, const Scope &scope) { - return GetVarValue("Output", outputs, scope); - } - - static Variable *OutVarFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVar("Out", outputs, scope); - } - - template - static T *OutFrom(const VariableNameMap &outputs, const Scope &scope) { - return GetVarValue("Out", outputs, scope); - } - - template - static vector OutMultiFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetMultiVarValue("Out", outputs, scope); - } - - template - static T *OutputYFrom(const VariableNameMap &outputs, const Scope &scope) { - return GetVarValue("Y", outputs, scope); - } - - template - static T *OutputXShapeFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("XShape", outputs, scope); - } - - template - static T *OutputBoxesFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("Boxes", outputs, scope); - } - - template - static T *OutputBoxFrom(const VariableNameMap &outputs, const Scope &scope) { - return GetVarValue("OutputBox", outputs, scope); - 
} - - template - static T *OutputNormFrom(const VariableNameMap &outputs, const Scope &scope) { - return GetVarValue("Norm", outputs, scope); - } - - template - static T *OutputVariancesFrom(const VariableNameMap &outputs, - const Scope &scope) { - return GetVarValue("Variances", outputs, scope); - } - - template - static T *MidOutFrom(const VariableNameMap &outputs, const Scope &scope) { - return GetVarValue("MidOut", outputs, scope); - } - - template - static T *FilterFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Filter", inputs, scope); - } - - template - static T *GridFrom(const VariableNameMap &inputs, const Scope &scope) { - return GetVarValue("Grid", inputs, scope); - } - - template - static const T GetAttr(const string &key, const AttributeMap &map) { - PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", - key.c_str()) - return ((Attribute)map.at(key)).Get(); - } - static const std::string GetStringAttr(const string &key, - const AttributeMap &map) { - PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", - key.c_str()) - return ((Attribute)map.at(key)).GetString(); - } - - static const bool HasAttr(const string &key, const AttributeMap &map) { - return map.count(key) > 0; - } - - static const bool HasVar(const string &key, const VariableNameMap &var_map) { - return var_map.count(key) > 0; - } - - template - static T *GetVarValue(const string &key, const VariableNameMap &var_map, - const Scope &scope) { - PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0, - "%s is not contained in var_map", key.c_str()) - auto var_vec = var_map.at(key); - if (!var_vec.empty()) { - auto var = scope.FindVar(var_vec[0]); - return var->GetMutable(); - } else { - return nullptr; - } - } - - static Variable *GetVar(const string &key, const VariableNameMap &var_map, - const Scope &scope) { - PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0, - "%s is not contained in var_map", key.c_str()) - auto var_vec = 
var_map.at(key); - if (!var_vec.empty()) { - auto var = scope.FindVar(var_vec[0]); - return var; - } else { - return nullptr; - } - } - - static std::string Getkey(const string &key, const VariableNameMap &var_map, - int index) { - PADDLE_MOBILE_ENFORCE(var_map.count(key) > index, - "%s is not contained in var_map", key.c_str()) - auto var_vec = var_map.at(key); - return var_vec[index]; - } - - template - static T *GetVarValue1(const string &key, const VariableNameMap &var_map, - const Scope &scope) { - PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0, - "%s is not contained in var_map", key.c_str()) - auto var_vec = var_map.at(key); - if (!var_vec.empty()) { - auto var = scope.FindVar(var_vec[1]); - return var->GetMutable(); - } else { - return nullptr; - } - } - - template - static vector GetMultiVarValue(const string &key, - const VariableNameMap &var_map, - const Scope &scope) { - auto var_vecs = var_map.at(key); - assert(var_vecs.size() > 1); - vector var_res; - for (auto &var_vec : var_vecs) { - auto var = scope.FindVar(var_vec); - var_res.push_back(var->GetMutable()); - } - return var_res; - } - - static vector GetMultiVar(const string &key, - const VariableNameMap &var_map, - const Scope &scope) { - auto var_vecs = var_map.at(key); - assert(var_vecs.size() > 1); - vector var_res; - for (auto &var_vec : var_vecs) { - auto var = scope.FindVar(var_vec); - var_res.push_back(var); - } - return var_res; - } -}; - -#define GET_VAR_AS_TENSOR(name, name_dict, scope) \ - OpParam::GetVarValue(name, name_dict, scope) - -#define GET_VAR_AS_LOD_TENSOR(name, name_dict, scope) \ - OpParam::GetVarValue(name, name_dict, scope) - -template -class ConvParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - filter_ = 
OpParam::FilterFrom(inputs, *scope); - input_ = OpParam::InputFrom(inputs, *scope); - if (outputs.count("Output")) { - output_ = OpParam::OutputFrom(outputs, *scope); - } - strides_ = OpParam::GetAttr>("strides", attrs); - paddings_ = OpParam::GetAttr>("paddings", attrs); - dilations_ = OpParam::GetAttr>("dilations", attrs); - groups = OpParam::GetAttr("groups", attrs); - } - - const GType *Input() const { return input_; } - - GType *Filter() const { return filter_; } - - GType *Output() const { return output_; } - - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - enum ExecMode { - EXEC_INVALID = 0, - EXEC_GEMM_FLOAT, - EXEC_DEPTHWISE3x3S1_FLOAT, - EXEC_DEPTHWISE3x3S2_FLOAT, - EXEC_WINOGRAD3X3_FLOAT, - EXEC_WINOGRAD5X5_FLOAT, - EXEC_DEPTHWISE5x5_FLOAT, - EXEC_GEMM_INT8, - EXEC_DEPTHWISE3x3_INT8, - EXEC_DEPTHWISE5x5_INT8, - EXEC_SLIDINGWINDOW3x3S1_FLOAT, - EXEC_SLIDINGWINDOW3x3S2_FLOAT, - EXEC_DEPTHWISE3x3_FLOAT, - EXEC_SLIDINGWINDOW1x1_FLOAT, - EXEC_SLIDINGWINDOW3x3_FLOAT, - EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT, - EXEC_SLIDINGWINDOW5x5_FLOAT, - EXEC_SLIDINGWINDOW7x7_FLOAT, - EXEC_GEMM1x1s1_FLOAT, - EXEC_DEPTHWISEBASIC_FLOAT, - }; - - ExecMode &ExecMode() const { return exec_mode_; } - - const int &Groups() const { return groups; } - -#ifdef PADDLE_MOBILE_CL - int Offset() const { return offset_; } - - int SetOffset(int in_offset) { offset_ = in_offset; } - -#endif - - public: - GType *input_; - GType *output_; - GType *filter_; - GType *transformed_filter_; - vector strides_; - vector paddings_; - vector dilations_; - mutable enum ExecMode exec_mode_; - int groups; - -#ifdef PADDLE_MOBILE_CL - int offset_; -#endif - -#ifdef PADDLE_MOBILE_FPGA - - public: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args 
= args; } - - public: - fpga::DWconvArgs fpga_dwconv_args; - - public: - const fpga::DWconvArgs &FpgaDwconvArgs() const { return fpga_dwconv_args; } - void SetFpgaArgs(const fpga::DWconvArgs &args) { fpga_dwconv_args = args; } -#endif -}; -template -Print &operator<<(Print &printer, const ConvParam &conv_param); - -template -class ElementwiseAddParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ElementwiseAddParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - axis_ = GetAttr("axis", attrs); - } - - const GType *InputX() const { return input_x_; } - - const GType *InputY() const { return input_y_; } - - GType *Out() const { return out_; } - - const int &Axis() const { return axis_; } - - private: - GType *input_x_; - GType *input_y_; - GType *out_; - int axis_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::EWAddArgs fpga_EW_add_args; - - public: - const fpga::EWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } - void SetFpgaArgs(const fpga::EWAddArgs &args) { fpga_EW_add_args = args; } - - public: - Tensor float_input_x, float_out; - -#endif -}; - -#ifdef ELEMENTWISEMUL_OP -template -class ElementwiseMulParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ElementwiseMulParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - axis_ = GetAttr("axis", attrs); - } - - const GType *InputX() const { return input_x_; } - - const GType *InputY() const { 
return input_y_; } - - GType *Out() const { return out_; } - - const int &Axis() const { return axis_; } - - private: - GType *input_x_; - GType *input_y_; - GType *out_; - int axis_; -#ifdef PADDLE_MOBILE_FPGA - - public: - Tensor float_input_x, float_out; - -#endif -}; -#endif - -#ifdef FUSION_ELEMENTWISEADDRELU_OP -template -using ElementwiseAddReluParam = ElementwiseAddParam; -#endif - -#ifdef ELEMENTWISESUB_OP -template -class ElementwiseSubParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ElementwiseSubParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - axis_ = GetAttr("axis", attrs); - } - - const GType *InputX() const { return input_x_; } - - const GType *InputY() const { return input_y_; } - - GType *Out() const { return out_; } - - const int &Axis() const { return axis_; } - - private: - GType *input_x_; - GType *input_y_; - GType *out_; - int axis_; -}; -#endif - -#ifdef MUL_OP -template -class MulParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); - y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); - } - - GType *InputX() const { return input_x_; } - - GType *InputY() const { return input_y_; } - - GType *Out() const { return out_; } - - const int &XNumColDims() const { return x_num_col_dims_; } - - const int &YNumColDims() const { 
return y_num_col_dims_; } - - private: - GType *input_x_; - GType *input_y_; - GType *out_; - int x_num_col_dims_; - int y_num_col_dims_; -}; -#endif - -#ifdef CONCAT_OP -template -class ConcatParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - inputs_ = InputMultiFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - axis_ = GetAttr("axis", attrs); - original_output_dims_size_ = out_->dims().size(); - } - - vector Inputs() const { return inputs_; } - - GType *Out() const { return out_; } - - const int &Axis() const { return axis_; } - - public: - vector inputs_; - GType *out_; - int axis_; - int original_output_dims_size_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::ConcatArgs fpga_concat_args; - - public: - const fpga::ConcatArgs &FpgaArgs() const { return fpga_concat_args; } - void SetFpgaArgs(const fpga::ConcatArgs &args) { fpga_concat_args = args; } -#endif -}; -#endif - -#ifdef SUM_OP -template -class SumParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - SumParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - inputs_vars_ = InputMultiVarsFrom(inputs, *scope); - out_var_ = OutVarFrom(outputs, *scope); - inputs_ = InputMultiFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - } - - vector InputsVars() const { return inputs_vars_; } - - Variable *OutVar() const { return out_var_; } - - vector Inputs() const { return inputs_; } - - GType *Out() const { return out_; } - - private: - vector inputs_vars_; - Variable *out_var_; - vector inputs_; - GType *out_; -}; -#endif - -#ifdef LRN_OP -template -class LrnParam : 
public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - mid_out_ = MidOutFrom(outputs, *scope); - n_ = GetAttr("n", attrs); - alpha_ = GetAttr("alpha", attrs); - beta_ = GetAttr("beta", attrs); - k_ = GetAttr("k", attrs); - data_format_ = GetStringAttr("data_format", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - GType *MidOut() const { return mid_out_; } - - const int &N() const { return n_; } - - const float &Alpha() const { return alpha_; } - - const float &Beta() const { return beta_; } - - const float &K() const { return k_; } - - const string &DataFormat() const { return data_format_; } - - private: - GType *input_x_; - GType *out_; - GType *mid_out_; - int n_; - float alpha_; - float beta_; - float k_; - string data_format_; -}; -#endif - -#ifdef NORM_OP -template -class NormParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - NormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - output_norm_ = OutputNormFrom(outputs, *scope); - epsilon_ = GetAttr("epsilon", attrs); - axis_ = GetAttr("axis", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - GType *OutputNorm() const { return output_norm_; } - - const float &Epsilon() const { return epsilon_; } - - const int &Axis() const { return axis_; } - - private: - GType *input_x_; - GType *out_; - GType *output_norm_; - float 
epsilon_; - int axis_; -}; -#endif - -#ifdef BATCHNORM_OP -template -class BatchNormParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - output_y_ = OutputYFrom(outputs, *scope); - input_bias_ = InputBiasFrom(inputs, *scope); - input_mean_ = InputMeanFrom(inputs, *scope); - input_scale_ = InputScaleFrom(inputs, *scope); - input_variance_ = InputVarianceFrom(inputs, *scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); - } - - ~BatchNormParam() {} - - const GType *InputX() const { return input_x_; } - - GType *OutputY() const { return output_y_; } - - const GType *InputBias() const { return input_bias_; } - - const GType *InputMean() const { return input_mean_; } - - const GType *InputScale() const { return input_scale_; } - - const GType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - const bool &IsTest() const { return is_test_; } - - const string &DataFormat() const { return data_format_; } - - void SetNewScale(GType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(GType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const GType *NewScale() const { return new_scale_.get(); } - - const GType *NewBias() const { return new_bias_.get(); } - - private: - GType *input_x_; - GType *output_y_; - GType *input_bias_; - GType *input_mean_; - GType *input_scale_; - GType *input_variance_; - float epsilon_; - float momentum_; - bool is_test_; - string data_format_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif 
- -#ifdef INSTANCENORM_OP -template -class InstanceNormParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - InstanceNormParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - output_y_ = OutputYFrom(outputs, *scope); - epsilon_ = GetAttr("epsilon", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *OutputY() const { return output_y_; } - - const float &Epsilon() const { return epsilon_; } - - private: - GType *input_x_; - GType *output_y_; - float epsilon_; -}; -#endif - -#ifdef FUSION_INSTANCENORM_RELU_OP -template -class FusionInstanceNormReluParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionInstanceNormReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - epsilon_ = GetAttr("epsilon", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - const float &Epsilon() const { return epsilon_; } - - private: - GType *input_x_; - GType *out_; - float epsilon_; -}; -#endif - -#ifdef POOL_OP -template -class PoolParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputXFrom(inputs, *scope); - - output_ = OutFrom(outputs, *scope); - pooling_type_ = GetStringAttr("pooling_type", attrs); - ksize_ = GetAttr>("ksize", attrs); - strides_ = 
GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - ceil_mode_ = GetAttr("ceil_mode", attrs); - global_pooling_ = GetAttr("global_pooling", attrs); - - if (HasAttr("exclusive", attrs)) { - exclusive_ = GetAttr("exclusive", attrs); - } else { - exclusive_ = true; - } - } - - const GType *Input() const { return input_; } - - GType *Output() const { return output_; } - - const string &PoolingType() const { return pooling_type_; } - - const vector &Ksize() const { return ksize_; } - - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - bool isCeilMode() const { return ceil_mode_; } - - bool isGlobalPooling() const { return global_pooling_; } - - bool isExclusive() const { return exclusive_; } - - private: - GType *input_; - GType *output_; - string pooling_type_; - vector ksize_; - vector strides_; - vector paddings_; - bool ceil_mode_; - bool global_pooling_ = false; - bool exclusive_ = true; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::PoolingArgs fpga_pool_args; - - public: - const fpga::PoolingArgs &FpgaArgs() const { return fpga_pool_args; } - void SetFpgaArgs(const fpga::PoolingArgs &args) { fpga_pool_args = args; } -#endif -}; -#endif - -#ifdef PRIORBOX_OP -template -class PriorBoxParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputFrom(inputs, *scope); - input_image_ = InputImageFrom(inputs, *scope); - output_boxes_ = OutputBoxesFrom(outputs, *scope); - output_variances_ = OutputVariancesFrom(outputs, *scope); - min_sizes_ = GetAttr>("min_sizes", attrs); - max_sizes_ = GetAttr>("max_sizes", attrs); - aspect_ratios_ = GetAttr>("aspect_ratios", attrs); - variances_ = GetAttr>("variances", attrs); - - if 
(HasAttr("min_max_aspect_ratios_order", attrs)) { - min_max_aspect_ratios_order_ = - GetAttr("min_max_aspect_ratios_order", attrs); - } else { - min_max_aspect_ratios_order_ = false; - } - flip_ = GetAttr("flip", attrs); - clip_ = GetAttr("clip", attrs); - step_w_ = GetAttr("step_w", attrs); - step_h_ = GetAttr("step_h", attrs); - offset_ = GetAttr("offset", attrs); - } - const GType *Input() const { return input_; } - - const GType *InputImage() const { return input_image_; } - - GType *OutputBoxes() const { return output_boxes_; } - - GType *OutputVariances() const { return output_variances_; } - - const vector &MinSizes() const { return min_sizes_; } - - const vector &MaxSizes() const { return max_sizes_; } - - const vector &AspectRatios() const { return aspect_ratios_; } - - const vector &Variances() const { return variances_; } - - const bool &Flip() const { return flip_; } - - const bool &Clip() const { return clip_; } - - const float &StepW() const { return step_w_; } - - const float &StepH() const { return step_h_; } - - const float &Offset() const { return offset_; } - - const bool &MinMaxAspectRatiosOrder() const { - return min_max_aspect_ratios_order_; - } - - private: - GType *input_; - GType *input_image_; - GType *output_boxes_; - GType *output_variances_; - vector min_sizes_; - vector max_sizes_; - vector aspect_ratios_; - vector variances_; - bool flip_; - bool clip_; - float step_w_; - float step_h_; - float offset_; - bool min_max_aspect_ratios_order_; -}; -#endif - -#ifdef BOXCODER_OP -template -class BoxCoderParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_priorbox_ = InputPriorBoxFrom(inputs, *scope); - input_priorboxvar_ = InputPriorBoxVarFrom(inputs, *scope); - input_targetbox_ = 
InputTargetBoxFrom(inputs, *scope); - output_box_ = OutputBoxFrom(outputs, *scope); - code_type_ = GetStringAttr("code_type", attrs); - } - const GType *InputPriorBox() const { return input_priorbox_; } - - const GType *InputPriorBoxVar() const { return input_priorboxvar_; } - - const GType *InputTargetBox() const { return input_targetbox_; } - - GType *OutputBox() const { return output_box_; } - - const std::string &CodeType() const { return code_type_; } - - private: - GType *input_priorbox_; - GType *input_priorboxvar_; - GType *input_targetbox_; - GType *output_box_; - std::string code_type_; -}; -#endif - -#ifdef SOFTMAX_OP -template -class SoftmaxParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - if (HasAttr("axis", attrs)) { - axis_ = GetAttr("axis", attrs); - has_axis_ = true; - } - } - const GType *InputX() const { return input_x_; } - GType *Out() const { return out_; } - - int axis_ = -1; - bool has_axis_ = false; - - private: - GType *input_x_; - GType *out_; - -#ifdef PADDLE_MOBILE_FPGA - -#ifdef PADDLE_MOBILE_FPGA_V1 - - private: - std::shared_ptr float_input_x_; - fpga::BypassArgs fpga_bypass_args; - - public: - GType *FloatInput() const { - return float_input_x_ == nullptr ? 
input_x_ : float_input_x_.get(); - } - void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); } - const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } - void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } -#else - - private: - fpga::BypassArgs fpga_bypass_args; - - public: - const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } - void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } - - public: - std::shared_ptr float_input_x_, float_out; -#endif -#endif -}; -#endif - -#ifdef SIGMOID_OP -template -class SigmoidParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - } - const GType *InputX() const { return input_x_; } - GType *Out() const { return out_; } - - private: - GType *input_x_; - GType *out_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::BypassArgs fpga_bypass_args; - - public: - const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } - void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } -#endif -}; -#endif - -#ifdef MULTICLASSNMS_OP -template -class MultiClassNMSParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - MultiClassNMSParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_bboxes_ = InputBBoxesFrom(inputs, *scope); - input_scores_ = InputScoresFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - background_label_ = GetAttr("background_label", attrs); - nms_top_k_ = GetAttr("nms_top_k", attrs); - 
keep_top_k_ = GetAttr("keep_top_k", attrs); - nms_threshold_ = GetAttr("nms_threshold", attrs); - nms_eta_ = GetAttr("nms_eta", attrs); - score_threshold_ = GetAttr("score_threshold", attrs); - } - - GType *InputBBoxes() const { return input_bboxes_; } - - GType *InputScores() const { return input_scores_; } - - GType *Out() const { return out_; } - - const int &BackGroundLabel() const { return background_label_; } - - const int &NMSTopK() const { return nms_top_k_; } - - const int &KeepTopK() const { return keep_top_k_; } - - const float &NMSThreshold() const { return nms_threshold_; } - - const float &NMSEta() const { return nms_eta_; } - - const float &ScoreThreshold() const { return score_threshold_; } - - private: - GType *input_bboxes_; - GType *input_scores_; - GType *out_; - int background_label_; - int nms_top_k_; - int keep_top_k_; - float nms_threshold_; - float nms_eta_; - float score_threshold_; -}; -#endif - -#ifdef POLYGONBOXTRANSFORM_OP -template -class PolygonBoxTransformParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - PolygonBoxTransformParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputFrom(inputs, *scope); - output_ = OutputFrom(outputs, *scope); - } - const GType *Input() const { return input_; } - GType *Output() const { return output_; } - - private: - GType *input_; - GType *output_; -}; -#endif - -template -class FeedParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom>(inputs, *scope); - out_ = OutFrom(outputs, *scope); - col_ = GetAttr("col", attrs); - auto var = 
scope->FindVar("batch_size"); - batch_size = var->GetValue(); - } - const std::vector *InputX() const { return input_x_; } - GType *Out() const { return out_; } - const int Col() const { return col_; } - const int BatchSize() const { return batch_size; } - - private: - std::vector *input_x_; - GType *out_; - int col_; - int batch_size; -}; - -template -class FetchParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom>(outputs, *scope); - col_ = GetAttr("col", attrs); - } - - const GType *InputX() const { return input_x_; } - std::vector *Out() const { return out_; } - const int Col() const { return col_; } - - private: - GType *input_x_; - std::vector *out_; - int col_; -#ifdef PADDLE_MOBILE_FPGA - - public: -#ifdef PADDLE_MOBILE_FPGA_V1 - fpga::BypassArgs fpga_bypass_args; - Tensor aligned_out; -#else - std::shared_ptr aligned_out; -#endif -#endif -}; - -#ifdef FILL_CONSTANT_OP -template -class FillConstantParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FillConstantParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - out_var_ = OutVarFrom(outputs, *scope); - out_ = OutFrom(outputs, *scope); - dtype_ = GetAttr("dtype", attrs); - shape_ = GetAttr>("shape", attrs); - value_ = GetAttr("value", attrs); - } - - Variable *OutVar() const { return out_var_; } - - GType *Out() const { return out_; } - - const int &DataDtype() const { return dtype_; } - - const vector &Shape() const { return shape_; } - - const float &Value() const { return value_; } - - private: - Variable 
*out_var_; - GType *out_; - int dtype_; - vector shape_; - float value_; -}; -#endif - -#ifdef FILL_CONSTANT_BATCH_SIZE_LIKE_OP -template -class FillConstantBatchSizeLikeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FillConstantBatchSizeLikeParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputFrom(inputs, *scope); - out_var_ = OutVarFrom(outputs, *scope); - out_ = OutFrom(outputs, *scope); - dtype_ = GetAttr("dtype", attrs); - shape_ = GetAttr>("shape", attrs); - value_ = GetAttr("value", attrs); - input_dim_idx_ = GetAttr("input_dim_idx", attrs); - output_dim_idx_ = GetAttr("output_dim_idx", attrs); - } - - Variable *OutVar() const { return out_var_; } - - const GType *Input() const { return input_; } - - GType *Out() const { return out_; } - - const int &DataDtype() const { return dtype_; } - - const vector &Shape() const { return shape_; } - - const float &Value() const { return value_; } - - int InputDimIdx() const { return input_dim_idx_; } - - int OutputDimIdx() const { return output_dim_idx_; } - - private: - GType *input_; - Variable *out_var_; - GType *out_; - int dtype_; - vector shape_; - float value_; - int input_dim_idx_; - int output_dim_idx_; -}; -#endif - -#ifdef TRANSPOSE_OP -template -class TransposeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - TransposeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - axis_ = GetAttr>("axis", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - const vector &Axis() const { 
return axis_; } - - private: - GType *input_x_; - GType *out_; - vector axis_; -}; -#endif - -#ifdef TRANSPOSE2_OP -template -class Transpose2Param : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - output_xshape_ = OutputXShapeFrom(outputs, *scope); - axis_ = GetAttr>("axis", attrs); - } - - GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - GType *OutputXShape() const { return output_xshape_; } - - const vector &Axis() const { return axis_; } - - private: - GType *input_x_; - GType *out_; - GType *output_xshape_; - vector axis_; -}; -#endif - -#ifdef LOOKUP_OP -template -class LookupParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - LookupParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_w_ = InputWFrom(inputs, *scope); - input_ids_ = InputIdsFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - padding_idx_ = GetAttr("padding_idx", attrs); - } - - const GType *InputW() const { return input_w_; } - const GType *InputIds() const { return input_ids_; } - GType *Out() const { return out_; } - int64_t PaddingIdx() const { return padding_idx_; } - - private: - GType *input_w_; - GType *input_ids_; - GType *out_; - int64_t padding_idx_; -}; -#endif - -#ifdef CRF_OP -template -class CrfParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - // {G_OP_TYPE_CRF, {{"Emission", "Transition", "Label"}, {"ViterbiPath"}}}, - - 
CrfParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - // todo crf params - input_emission_ = InputEmissionFrom(inputs, *scope); - input_transition_ = InputTransitionFrom(inputs, *scope); - input_label_ = InputLabelFrom(inputs, *scope); - output_viterbipath_ = OutputViterbiPathFrom(outputs, *scope); - // padding_idx_ = GetAttr("padding_idx", attrs); - } - const GType *InputEmission() const { return input_emission_; } - const GType *InputTransition() const { return input_transition_; } - const GType *InputLabel() const { return input_label_; } - GType *outputVBP() const { return output_viterbipath_; } - // const GType *InputIds() const { return input_ids_; } - // GType *Out() const { return out_; } - // int64_t PaddingIdx() const { return padding_idx_; } - - private: - GType *input_emission_; - GType *input_transition_; - GType *input_label_; - GType *output_viterbipath_; - - // GType *input_ids_; - // GType *out_; - // int64_t padding_idx_; -}; -#endif - -#ifdef RESHAPE_OP -template -class ReshapeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ReshapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_shape_ = InputShapeFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - shape_ = GetAttr>("shape", attrs); - - if (HasAttr("inplace", attrs)) { - inplace_ = GetAttr("inplace", attrs); - } else { - inplace_ = false; - DLOG << "ReshapeParam lost inplace params. 
maybe fluid updated"; - } - } - - const GType *InputX() const { return input_x_; } - - const GType *InputShape() const { return input_shape_; } - - GType *Out() const { return out_; } - - const vector &Shape() const { return shape_; } - - const bool &Inplace() const { return inplace_; } - - private: - GType *input_x_; - GType *input_shape_; - GType *out_; - vector shape_; - bool inplace_; -}; -#endif - -#ifdef RESHAPE2_OP -template -class Reshape2Param : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - Reshape2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_shape_ = InputShapeFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - output_xshape_ = OutputXShapeFrom(outputs, *scope); - shape_ = GetAttr>("shape", attrs); - if (HasAttr("inplace", attrs)) { - inplace_ = GetAttr("inplace", attrs); - } else { - inplace_ = false; - } - } - - GType *InputX() const { return input_x_; } - - const GType *InputShape() const { return input_shape_; } - - GType *Out() const { return out_; } - - GType *OutputXShape() const { return output_xshape_; } - - const vector &Shape() const { return shape_; } - - const bool &Inplace() const { return inplace_; } - - private: - GType *input_x_; - GType *input_shape_; - GType *out_; - GType *output_xshape_; - vector shape_; - bool inplace_; -}; -#endif - -#ifdef SCALE_OP -template -class ScaleParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ScaleParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - scale_ = GetAttr("scale", attrs); - bias_ 
= GetAttr("bias", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - const float Scale() const { return scale_; } - - const float Bias() const { return bias_; } - - private: - GType *input_x_; - GType *out_; - float scale_; - float bias_; -}; -#endif - -#ifdef SLICE_OP -template -class SliceParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - SliceParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - - axes_ = GetAttr>("axes", attrs); - starts_ = GetAttr>("starts", attrs); - ends_ = GetAttr>("ends", attrs); - - original_output_dims_size_ = output_->dims().size(); - } - - public: - GType *input_; - GType *output_; - std::vector axes_; - std::vector starts_; - std::vector ends_; - int original_output_dims_size_; -}; -#endif - -#ifdef RESIZE_OP -template -class ResizeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ResizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_shape_ = InputShapeFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - is_pyramid_test_ = GetAttr("is_pyramid_test", attrs); - height_ = GetAttr("height", attrs); - width_ = GetAttr("width", attrs); - out_height_scale_ = GetAttr("out_height_scale", attrs); - out_width_scale_ = GetAttr("out_width_scale", attrs); - } - - const GType *InputX() const { return input_x_; } - - const GType *InputShape() const { return input_shape_; } - - GType *Out() const { return out_; } - - const bool &IsPyramidTest() const { return is_pyramid_test_; } - - 
const int &Height() const { return height_; } - - const int &Width() const { return width_; } - - const float &OutHeightScale() const { return out_height_scale_; } - - const float &OutWidthScale() const { return out_width_scale_; } - - private: - GType *input_x_; - GType *input_shape_; - GType *out_; - bool is_pyramid_test_; - int height_; - int width_; - float out_height_scale_; - float out_width_scale_; -}; -#endif - -#ifdef RELU_OP -/* - * @b op 层实例化好这个 param 传递给 kernel 层使用 - * */ -template -class ReluParamBase : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ReluParamBase(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - private: - GType *input_x_; - GType *out_; -}; - -template -class ReluParam : public ReluParamBase { - public: - using ReluParamBase::ReluParamBase; -}; - -template -class Relu6Param : public ReluParamBase { - public: - Relu6Param(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ReluParamBase(inputs, outputs, attrs, scope) { - threshold = OpParam::GetAttr("threshold", attrs); - } - float getThreshold() const { return threshold; } - - private: - float threshold; -}; - -#ifdef PADDLE_MOBILE_CL -template <> -class ReluParam : public ReluParamBase { - public: - using ReluParamBase::ReluParamBase; - framework::CLImage &getMidImage() { return midImage; } - - private: - framework::CLImage midImage; -}; -#endif - -#endif - -#ifdef TANH_OP -template -class TanhParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - TanhParam(const VariableNameMap &inputs, 
const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - } - const GType *InputX() const { return input_x_; } - GType *Out() const { return out_; } - - private: - GType *input_x_; - GType *out_; -#ifdef PADDLE_MOBILE_FPGA - - private: - std::shared_ptr float_input_x_; - fpga::BypassArgs fpga_bypass_args; - - public: - GType *FloatInput() const { - return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); - } - void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); } - const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } - void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } -#endif -}; -#endif - -#ifdef PRELU_OP -template -class PReluParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - PReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - DLOG << "PReluParam inputs before"; - input_x_ = InputXFrom(inputs, *scope); - alpha_ = InputAlphaFrom(inputs, *scope); - framework::DDim dims = alpha_->dims(); - out_ = OutFrom(outputs, *scope); - mode_ = GetStringAttr("mode", attrs); - DLOG << "PReluParam mode after" << mode_; - } - const GType *InputX() const { return input_x_; } - const GType *InputAlpha() const { return alpha_; } - GType *Out() const { return out_; } - const std::string &Mode() const { return mode_; } - - private: - GType *input_x_; - GType *out_; - GType *alpha_; - std::string mode_; -}; -#endif - -#ifdef LEAKY_RELU_OP -template -class LeakyReluParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - LeakyReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const 
AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - alpha_ = GetAttr("alpha", attrs); - } - const GType *InputX() const { return input_x_; } - const float Alpha() const { return alpha_; } - GType *Out() const { return out_; } - - private: - GType *input_x_; - GType *out_; - float alpha_; -}; -#endif - -template -class FusionFcParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - input_z_ = InputZFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); - y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); - axis_ = GetAttr("axis", attrs); - } - GType *InputX() const { return input_x_; } - - GType *InputY() const { return input_y_; } - - GType *InputZ() const { return input_z_; } - - GType *Out() const { return out_; } - - const int &XNumColDims() const { return x_num_col_dims_; } - - const int &YNumColDims() const { return y_num_col_dims_; } - - const int &Axis() const { return axis_; } - - private: - GType *input_x_; - GType *input_y_; - GType *input_z_; - GType *out_; - int x_num_col_dims_; - int y_num_col_dims_; - int axis_; - -#ifdef PADDLE_MOBILE_FPGA - private: // NOLINT - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif -}; - -#ifdef FUSION_FCRELU_OP -template -using FusionFcReluParam = FusionFcParam; -#endif - -template -class FusionConvAddParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - 
typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvAddParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - this->output_ = OpParam::OutFrom(outputs, *scope); - } - GType *Bias() const { return bias_; } - - const int &Axis() const { return axis_; } - - protected: - GType *bias_; - int axis_; -}; - -template -Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); - -#ifdef FUSION_CONVADDRELU_OP -template -class FusionConvAddReluParam : public FusionConvAddParam { - public: - FusionConvAddReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : FusionConvAddParam(inputs, outputs, attrs, scope) {} -}; -#endif - -#ifdef FUSION_CONVADDPRELU_OP -template -class FusionConvAddPReluParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvAddPReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - alpha_ = OpParam::InputAlphaFrom(inputs, *scope); - mode_ = OpParam::GetStringAttr("mode", attrs); - framework::DDim dims = alpha_->dims(); - bias_ = OpParam::InputYFrom(inputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - this->output_ = OpParam::OutFrom(outputs, *scope); - } - const GType *InputAlpha() const { return alpha_; } - const std::string &Mode() const { return mode_; } - GType *Bias() const { return bias_; } - const int &Axis() const { return axis_; } - - protected: - GType *bias_; - int axis_; - GType *alpha_; - std::string mode_; -}; -#endif - -#ifdef FUSION_CONVADDADDPRELU_OP -template -class FusionConvAddAddPReluParam : public ConvParam { - typedef typename 
DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvAddAddPReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - bias1_ = OpParam::InputYFrom1(inputs, *scope); - alpha_ = OpParam::InputAlphaFrom(inputs, *scope); - mode_ = OpParam::GetStringAttr("mode", attrs); - framework::DDim dims = alpha_->dims(); - bias_ = OpParam::InputYFrom(inputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - keyOutput_ = OpParam::Getkey("addOut", inputs, 0); - keyX1_ = OpParam::Getkey("addX", inputs, 1); - keyY1_ = OpParam::Getkey("Y", inputs, 1); - if (keyX1_ == keyOutput_) { - bias1_ = OpParam::InputYFrom1(inputs, *scope); - } else if (keyY1_ == keyOutput_) { - bias1_ = OpParam::InputXFrom1(inputs, *scope); - } - this->output_ = OpParam::OutFrom(outputs, *scope); - } - const GType *InputAlpha() const { return alpha_; } - const std::string &Mode() const { return mode_; } - const GType *Bias1() const { return bias1_; } - - GType *Bias() const { return bias_; } - - const int &Axis() const { return axis_; } - - protected: - GType *bias_; - int axis_; - GType *alpha_; - std::string mode_; - GType *bias1_; - std::string keyOutput_; - std::string keyX1_; - std::string keyY1_; -}; -#endif - -#ifdef FUSION_CONVADDBNRELU_OP -template -class FusionConvAddBNReluParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvAddBNReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, 
*scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - this->output_ = OpParam::OutFrom(outputs, *scope); - } - - ~FusionConvAddBNReluParam() {} - - GType *Bias() const { return bias_; } - - const int &Axis() const { return axis_; } - - const GType *InputBias() const { return input_bias_; } - - const GType *InputMean() const { return input_mean_; } - - const GType *InputScale() const { return input_scale_; } - - const GType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - void SetNewScale(GType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(GType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const GType *NewScale() const { return new_scale_.get(); } - - const GType *NewBias() const { return new_bias_.get(); } - - protected: - GType *bias_; - int axis_; - GType *input_bias_; - GType *input_mean_; - GType *input_scale_; - GType *input_variance_; - float epsilon_; - float momentum_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif - -#ifdef FUSION_CONVBNADDRELU_OP -template -class FusionConvBNAddReluParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvBNAddReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = 
OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - keyBNY_ = OpParam::Getkey("BNY", inputs, 0); - keyX_ = OpParam::Getkey("X", inputs, 0); - keyY_ = OpParam::Getkey("Y", inputs, 0); - if (keyX_ == keyBNY_) { - bias_ = OpParam::InputYFrom(inputs, *scope); - } else if (keyY_ == keyBNY_) { - bias_ = OpParam::InputXFrom(inputs, *scope); - } - this->output_ = OpParam::OutFrom(outputs, *scope); - } - - ~FusionConvBNAddReluParam() {} - GType *Bias() const { return bias_; } - - const int &Axis() const { return axis_; } - - const GType *InputBias() const { return input_bias_; } - - const GType *InputMean() const { return input_mean_; } - - const GType *InputScale() const { return input_scale_; } - - const GType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - void SetNewScale(GType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(GType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const GType *NewScale() const { return new_scale_.get(); } - - const GType *NewBias() const { return new_bias_.get(); } - - protected: - GType *bias_; - int axis_; - GType *input_bias_; - GType *input_mean_; - GType *input_scale_; - GType *input_variance_; - float epsilon_; - float momentum_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; - std::string keyBNY_; - std::string keyX_; - std::string keyY_; -}; -#endif - -#ifdef FUSION_CONVBN_OP -template -class FusionConvBNParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvBNParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = 
OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - this->output_ = OpParam::OutputYFrom(outputs, *scope); - } - - const GType *InputBias() const { return input_bias_; } - - const GType *InputMean() const { return input_mean_; } - - const GType *InputScale() const { return input_scale_; } - - const GType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - void SetNewScale(GType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(GType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const GType *NewScale() const { return new_scale_.get(); } - - const GType *NewBias() const { return new_bias_.get(); } - - protected: - GType *input_bias_; - GType *input_mean_; - GType *input_scale_; - GType *input_variance_; - float epsilon_; - float momentum_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif - -#ifdef FUSION_CONVADDBN_OP -template -class FusionConvAddBNParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvAddBNParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", 
attrs); - this->output_ = OpParam::OutputYFrom(outputs, *scope); - } - GType *Bias() const { return bias_; } - - const int &Axis() const { return axis_; } - - const GType *InputBias() const { return input_bias_; } - - const GType *InputMean() const { return input_mean_; } - - const GType *InputScale() const { return input_scale_; } - - const GType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - void SetNewScale(GType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(GType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const GType *NewScale() const { return new_scale_.get(); } - - const GType *NewBias() const { return new_bias_.get(); } - - protected: - GType *bias_; - int axis_; - GType *input_bias_; - GType *input_mean_; - GType *input_scale_; - GType *input_variance_; - float epsilon_; - float momentum_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif - -#ifdef FUSION_DWCONVBNRELU_OP -template -class FusionDWConvBNReluParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionDWConvBNReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - this->output_ = OpParam::OutFrom(outputs, *scope); - } - - ~FusionDWConvBNReluParam() {} - - const GType *InputBias() const { return input_bias_; } - - const GType *InputMean() const { return input_mean_; } - - 
const GType *InputScale() const { return input_scale_; } - - const GType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - void SetNewScale(GType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(GType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const GType *NewScale() const { return new_scale_.get(); } - - const GType *NewBias() const { return new_bias_.get(); } - - protected: - GType *input_bias_; - GType *input_mean_; - GType *input_scale_; - GType *input_variance_; - float epsilon_; - float momentum_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; - -#endif - -#ifdef FUSION_CONVRELU_OP -template -class FusionConvReluParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - this->output_ = OpParam::OutFrom(outputs, *scope); - } -}; -#endif - -#ifdef FUSION_CONVBNRELU_OP -template -class FusionConvBNReluParam : public ConvParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionConvBNReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvParam(inputs, outputs, attrs, scope) { - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - this->output_ = OpParam::OutFrom(outputs, *scope); - } - - 
~FusionConvBNReluParam() {} - - const GType *InputBias() const { return input_bias_; } - - const GType *InputMean() const { return input_mean_; } - - const GType *InputScale() const { return input_scale_; } - - const GType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - void SetNewScale(GType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(GType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const GType *NewScale() const { return new_scale_.get(); } - - const GType *NewBias() const { return new_bias_.get(); } - - protected: - GType *input_bias_; - GType *input_mean_; - GType *input_scale_; - GType *input_variance_; - float epsilon_; - float momentum_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif - -#ifdef IM2SEQUENCE_OP -template -class Im2SequenceParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - Im2SequenceParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - kernels_ = GetAttr>("kernels", attrs); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - } - - const GType *Input() const { return input_x_; } - - GType *Output() const { return out_; } - - const vector &Kernels() const { return kernels_; } - - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - private: - GType *input_x_; - GType *out_; - vector kernels_; - vector strides_; - vector paddings_; -}; -#endif - -#ifdef DROPOUT_OP -template -class DropoutParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename 
DtypeTensorTrait::rtype RType; - - public: - DropoutParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - - dropout_prob_ = GetAttr("dropout_prob", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - float DropoutProb() const { return dropout_prob_; } - - private: - GType *input_x_; - GType *out_; - float dropout_prob_; -}; -#endif - -template -class ConvTransposeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ConvTransposeParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - filter_ = OpParam::FilterFrom(inputs, *scope); - input_ = OpParam::InputFrom(inputs, *scope); - // output_ = OutputFrom(outputs, scope); - if (outputs.count("Output")) { - output_ = OpParam::OutputFrom(outputs, *scope); - } - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - if (HasAttr("output_size", attrs)) { - output_size_ = GetAttr>("output_size", attrs); - DLOG << "conv transpose output size: " << output_size_; - } - groups = GetAttr("groups", attrs); - } - - const GType *Input() const { return input_; } - - GType *Filter() const { return filter_; } - - GType *Output() const { return output_; } - - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Filters() const { return filter_; } - - const vector &TransFilters() const { return transformed_filter_; } - - const vector &Dilations() const { return dilations_; } - - const vector &OutputSize() const { return output_size_; } - - const int &Groups() const { return 
groups; } - - enum ExecMode { - EXEC_INVALID = 0, - EXEC_GEMM_FLOAT, - EXEC_DECONV3X3_FLOAT, - EXEC_DECONV4X4_FLOAT, - EXEC_DEPTHWISETRANS_FLOAT, - EXEC_CONVTRANS3x3s2_FLOAT, - EXEC_CONVTRANS_FLOAT, - }; - - ExecMode &ExecMode() const { return exec_mode_; } - - private: - GType *input_; - GType *output_; - GType *filter_; - GType *transformed_filter_; - vector strides_; - vector paddings_; - vector dilations_; - vector output_size_; - int groups; - mutable enum ExecMode exec_mode_; - -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::DeconvArgs fpga_conv_args; - fpga::DWDeconvArgs fpga_DWDeconv_args; - - public: - const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; } - const fpga::DWDeconvArgs &FpgaDWDconvArgs() const { - return fpga_DWDeconv_args; - } - void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; } - void SetFpgaArgs(const fpga::DWDeconvArgs &args) { - fpga_DWDeconv_args = args; - } -#endif -}; - -#ifdef FUSION_DECONVADD_OP -template -class FusionDeconvAddParam : public ConvTransposeParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionDeconvAddParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvTransposeParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - output_ = OpParam::OutFrom(outputs, *scope); - } - GType *Bias() const { return bias_; } - - const int &Axis() const { return axis_; } - - GType *Output() const { return output_; } - - protected: - GType *bias_; - int axis_; - GType *output_; -}; -#endif - -#ifdef FUSION_DECONVADDRELU_OP -template -using FusionDeconvAddReluParam = FusionDeconvAddParam; -#endif -#ifdef FUSION_DECONVADDBN_OP -template -class FusionDeconvAddBNParam : public ConvTransposeParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; 
- - public: - FusionDeconvAddBNParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvTransposeParam(inputs, outputs, attrs, scope) { - output_ = OpParam::OutFrom(outputs, *scope); - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - // is_test_ = OpParam::GetAttr("is_test", attrs); - } - RType *Output() const { return output_; } - - const RType *InputBias() const { return input_bias_; } - - const RType *InputMean() const { return input_mean_; } - - const RType *InputScale() const { return input_scale_; } - - const RType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - const bool &IsTest() const { return is_test_; } - - void SetNewScale(RType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(RType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const RType *NewScale() const { return new_scale_.get(); } - - const RType *NewBias() const { return new_bias_.get(); } - - protected: - RType *output_; - RType *input_bias_; - RType *input_mean_; - RType *input_scale_; - RType *input_variance_; - float epsilon_; - float momentum_; - bool is_test_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif -#ifdef FUSION_DECONVBNRELU_OP -template -class FusionDeconvBNReluParam : public ConvTransposeParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionDeconvBNReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - 
: ConvTransposeParam(inputs, outputs, attrs, scope) { - output_ = OpParam::OutFrom(outputs, *scope); - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - } - RType *Output() const { return output_; } - - const RType *InputBias() const { return input_bias_; } - - const RType *InputMean() const { return input_mean_; } - - const RType *InputScale() const { return input_scale_; } - - const RType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - const bool &IsTest() const { return is_test_; } - - void SetNewScale(RType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(RType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const RType *NewScale() const { return new_scale_.get(); } - - const RType *NewBias() const { return new_bias_.get(); } - - protected: - RType *output_; - RType *input_bias_; - RType *input_mean_; - RType *input_scale_; - RType *input_variance_; - float epsilon_; - float momentum_; - bool is_test_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif -#ifdef FUSION_DECONVADDBNRELU_OP -template -class FusionDeconvAddBNReluParam : public ConvTransposeParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionDeconvAddBNReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : ConvTransposeParam(inputs, outputs, attrs, scope) { - output_ = OpParam::OutFrom(outputs, *scope); - input_bias_ = OpParam::InputBiasFrom(inputs, *scope); - input_mean_ = 
OpParam::InputMeanFrom(inputs, *scope); - input_scale_ = OpParam::InputScaleFrom(inputs, *scope); - input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - momentum_ = OpParam::GetAttr("momentum", attrs); - // is_test_ = OpParam::GetAttr("is_test", attrs); - } - RType *Output() const { return output_; } - - const RType *InputBias() const { return input_bias_; } - - const RType *InputMean() const { return input_mean_; } - - const RType *InputScale() const { return input_scale_; } - - const RType *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - const bool &IsTest() const { return is_test_; } - - void SetNewScale(RType *new_scale) { - new_scale_.reset(new_scale, CLImageDeleter()); - } - - void SetNewBias(RType *new_bias) { - new_bias_.reset(new_bias, CLImageDeleter()); - } - - const RType *NewScale() const { return new_scale_.get(); } - - const RType *NewBias() const { return new_bias_.get(); } - - protected: - RType *output_; - RType *input_bias_; - RType *input_mean_; - RType *input_scale_; - RType *input_variance_; - float epsilon_; - float momentum_; - bool is_test_; - std::shared_ptr new_bias_; - std::shared_ptr new_scale_; -}; -#endif - -#ifdef FUSION_DECONVRELU_OP -template -using FusionDeconvReluParam = ConvTransposeParam; -#endif - -#ifdef GRU_OP -template -class GruParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - - public: - /** - * - * @param inputs - * @param outputs - * @param attrs - * @param scope - * */ - GruParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_input_ = InputFrom(inputs, *scope); - input_h0_ = InputH0From(inputs, *scope); - input_bias_ = InputBiasFrom(inputs, *scope); - input_weight_ = InputWeightFrom(inputs, *scope); - - 
output_batch_gate_ = OutputBatchGateFrom(outputs, *scope); - output_batch_reset_hidden_prev_ = - OutputBatchResetHiddenPrevFrom(outputs, *scope); - output_batch_hidden_ = OutputBatchHiddenFrom(outputs, *scope); - output_hidden_ = OutputHiddenFrom(outputs, *scope); - activation_ = GetStringAttr("activation", attrs); - gate_activation_ = GetStringAttr("gate_activation", attrs); - is_reverse_ = GetAttr("is_reverse", attrs); - } - const GType *InputInput() const { return input_input_; } - const GType *InputWeight() const { return input_weight_; } - const GType *InputH0() const { return input_h0_; } - const GType *InputBias() const { return input_bias_; } - const std::string &Activation() const { return activation_; } - const std::string &GateActivation() const { return gate_activation_; } - const bool &IsReverse() const { return is_reverse_; } - - GType *OutBatchGate() const { return output_batch_gate_; } - GType *OutBatchResetHiddenPrev() const { - return output_batch_reset_hidden_prev_; - } - GType *OutBatchHidden() const { return output_batch_hidden_; } - GType *OutHidden() const { return output_hidden_; } - - private: - GType *input_input_; - GType *input_h0_; - GType *input_bias_; - GType *input_weight_; - - GType *output_batch_gate_; - GType *output_batch_reset_hidden_prev_; - GType *output_batch_hidden_; - GType *output_hidden_; - std::string activation_; - std::string gate_activation_; - bool is_reverse_; -}; -#endif - -#ifdef GRU_UNIT_OP -template -class GruUnitParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - - public: - GruUnitParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_input_ = InputFrom(inputs, *scope); - input_hidden_prev_ = InputHiddenPrevFrom(inputs, *scope); - input_bias_ = InputBiasFrom(inputs, *scope); - input_weight_ = InputWeightFrom(inputs, *scope); - - output_gate_ = OutputGateFrom(outputs, *scope); - 
output_reset_hidden_prev_ = - OutputResetHiddenPrevFrom(outputs, *scope); - output_hidden_ = OutputHiddenFrom(outputs, *scope); - activation_ = GetAttr("activation", attrs); - gate_activation_ = GetAttr("gate_activation", attrs); - } - const GType *InputInput() const { return input_input_; } - const GType *InputWeight() const { return input_weight_; } - const GType *InputHiddenPrev() const { return input_hidden_prev_; } - const GType *InputBias() const { return input_bias_; } - const int &Activation() const { return activation_; } - const int &GateActivation() const { return gate_activation_; } - - GType *OutGate() const { return output_gate_; } - GType *OutResetHiddenPrev() const { return output_reset_hidden_prev_; } - GType *OutHidden() const { return output_hidden_; } - - private: - GType *input_input_; - GType *input_hidden_prev_; - GType *input_bias_; - GType *input_weight_; - - GType *output_gate_; - GType *output_reset_hidden_prev_; - GType *output_hidden_; - int activation_; - int gate_activation_; -}; -#endif - -#ifdef FLATTEN_OP -template -class FlattenParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FlattenParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - axis = GetAttr("axis", attrs); - } - const GType *InputX() const { return input_x_; } - GType *Out() const { return out_; } - const int &Axis() const { return axis; } - - private: - GType *input_x_; - GType *out_; - int axis; -}; -#endif - -#ifdef SPLIT_OP -template -class SplitParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - SplitParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : 
OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - outs_ = OutMultiFrom(outputs, *scope); - axis = GetAttr("axis", attrs); - num = GetAttr("num", attrs); - sections = GetAttr>("sections", attrs); - - // for (int i = 0; i < outs_.size(); ++i) { - // out_ts_.push_back(*scope.FindVar(outs_[i])->GetMutable()); - // } - } - GType *InputX() const { return input_x_; } - std::vector Outs() const { return outs_; } - int Axis() const { return axis; } - int Num() const { return num; } - std::vector Sections() const { return sections; } - // std::vector OutTs() const { return out_ts_; } - - private: - GType *input_x_; - std::vector outs_; - int axis; - int num; - std::vector sections; -// std::vector out_ts_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitArgs fpga_split_args; - - public: - const fpga::SplitArgs &FpgaArgs() const { return fpga_split_args; } - void SetFpgaArgs(const fpga::SplitArgs &args) { fpga_split_args = args; } -#endif -}; -#endif - -#ifdef BILINEAR_INTERP_OP -template -class BilinearInterpParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - BilinearInterpParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_outsize_ = InputOutSizeFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - out_h_ = GetAttr("out_h", attrs); - out_w_ = GetAttr("out_w", attrs); - align_corners = GetAttr("align_corners", attrs); - align_mode = GetAttr("align_mode", attrs); - if (HasAttr("scale", attrs)) { - has_scale_ = true; - scale_ = GetAttr("scale", attrs); - } - LOG(kLOG_DEBUG1) << "has_scale_: " << has_scale_; - LOG(kLOG_DEBUG1) << "scale_: " << scale_; - } - const GType *InputX() const { return input_x_; } - const GType *InputOutPutSize() const { return input_outsize_; } - GType *Out() const { 
return out_; } - int OutH() const { return out_h_; } - int OutW() const { return out_w_; } - bool AlignCorners() const { return align_corners; } - int AlignMode() const { return align_mode; } - float Scale() const { return scale_; } - bool HasScale() const { return has_scale_; } - - private: - GType *input_x_; - GType *input_outsize_; - GType *out_; - int out_h_; - int out_w_; - bool align_corners; - int align_mode; - float scale_; - bool has_scale_; -}; -#endif - -#ifdef NEAREST_INTERP_OP -template -class NearestInterpolationParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - NearestInterpolationParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - const bool has_out_size = HasVar("OutSize", inputs); - - if (has_out_size) { - input_outsize_ = InputOutSizeFrom(inputs, *scope); - } - - out_ = OutFrom(outputs, *scope); - - if (HasAttr("out_h", attrs)) { - out_h_ = GetAttr("out_h", attrs); - } else if (HasAttr("out_h ", attrs)) { - // some models hurts .... attr with space .. - out_h_ = GetAttr("out_h ", attrs); - } - - if (HasAttr("out_w", attrs)) { - out_w_ = GetAttr("out_w", attrs); - } else if (HasAttr("out_w ", attrs)) { - // some models hurts .... attr with space .. 
- out_w_ = GetAttr("out_w ", attrs); - } - - LOG(kLOG_DEBUG1) << "out_h_: " << out_h_; - LOG(kLOG_DEBUG1) << "out_w_: " << out_w_; - - if (HasAttr("scale", attrs)) { - has_scale_ = true; - scale_ = GetAttr("scale", attrs); - } - LOG(kLOG_DEBUG1) << "has_scale_: " << has_scale_; - LOG(kLOG_DEBUG1) << "scale_: " << scale_; - } - const GType *InputX() const { return input_x_; } - const GType *InputOutPutSize() const { return input_outsize_; } - GType *Out() const { return out_; } - int OutH() const { return out_h_; } - int OutW() const { return out_w_; } - float Scale() const { return scale_; } - bool HasScale() const { return has_scale_; } - - private: - GType *input_x_; - GType *input_outsize_; - GType *out_; - int out_h_; - int out_w_; - float scale_; - bool has_scale_; -}; -#endif - -#ifdef SHAPE_OP -template -class ShapeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ShapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - } - const GType *Input() const { return input_; } - GType *Out() const { return out_; } - - private: - GType *input_; - GType *out_; -}; -#endif - -#ifdef TOP_K_OP -template -class TopKParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - TopKParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = OpParam::GetVarValue("X", inputs, *scope); - output_ = OpParam::GetVarValue("Out", outputs, *scope); - indices_ = OpParam::GetVarValue("Indices", outputs, *scope); - k_ = OpParam::GetAttr("k", attrs); - } - - public: - GType *input_; - GType *output_; - GType *indices_; - int k_; -}; -#endif 
// TOP_K_OP - -#ifdef CAST_OP -template -class CastParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - CastParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = OpParam::GetVarValue("X", inputs, *scope); - output_ = OpParam::GetVarValue("Out", outputs, *scope); - input_type_ = OpParam::GetAttr("in_dtype", attrs); - output_type_ = OpParam::GetAttr("out_dtype", attrs); - } - - public: - GType *input_; - GType *output_; - int input_type_; - int output_type_; -}; -#endif // CAST_OP - -#ifdef QUANT_OP -template -class QuantizeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - QuantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - // online - // scale = max(abs(x)) - online_scale_ = OpParam::GetVarValue("OutScale", outputs, *scope); - // offline - if (inputs.count("InScale")) { - offline_ = true; - offline_scale_ = OpParam::GetVarValue("InScale", inputs, *scope); - } - // x = round(scale * x) - if (OpParam::HasAttr("round_type", attrs)) { - round_type_ = OpParam::GetAttr("round_type", attrs); - } - } - - public: - // op input - GType *input_; - // op output - GType *output_; - GType *online_scale_; - // quantize offline scale - GType *offline_scale_; - // if offine scale or not - bool offline_ = false; - // round method type - // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO; - RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO; -}; -#endif - -#ifdef DEQUANT_OP -template -class DequantizeParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename 
DtypeTensorTrait::rtype RType; - - public: - DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - activation_scale_ = OpParam::GetVarValue("Scale", inputs, *scope); - // dequantization is performed as x = x / static_scale / online_scale - if (OpParam::HasAttr("weight_scale", attrs)) { - weight_scale_ = OpParam::GetAttr("weight_scale", attrs); - } else { - weight_scale_ = OpParam::GetAttr("max_range", attrs); - } - } - - public: - // op input - GType *input_; - // op output - GType *output_; - GType *activation_scale_; - float weight_scale_; -}; -#endif - -#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \ - defined(FUSION_DEQUANT_BN_RELU_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) -template -class FusionDequantBNParam : public DequantizeParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionDequantBNParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : DequantizeParam(inputs, outputs, attrs, scope) { - // batch norm params - bn_mean_ = OpParam::GetVarValue("BNMean", inputs, *scope); - bn_variance_ = OpParam::GetVarValue("BNVariance", inputs, *scope); - bn_scale_ = OpParam::GetVarValue("BNScale", inputs, *scope); - bn_bias_ = OpParam::GetVarValue("BNBias", inputs, *scope); - epsilon_ = OpParam::GetAttr("epsilon", attrs); - } - - public: - // batch norm - GType *bn_mean_; - GType *bn_variance_; - GType *bn_scale_; - GType *bn_bias_; - float epsilon_; -}; -#endif - -#if defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_OP) || \ - defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \ - 
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) -template -class FusionDequantAddBNParam : public FusionDequantBNParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionDequantAddBNParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : FusionDequantBNParam(inputs, outputs, attrs, scope) { - // element wise add params - axis_ = OpParam::GetAttr("axis", attrs); - bias_ = OpParam::InputYFrom(inputs, *scope); - } - - public: - // elementwise add - int axis_; - GType *bias_; -}; -#endif - -#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP -template -class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - FusionDequantAddBNQuantParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : FusionDequantAddBNParam(inputs, outputs, attrs, scope) { - // scale output - online_scale_ = OpParam::GetVarValue("OutScale", outputs, *scope); - // offline - if (inputs.count("InScale")) { - offline_ = true; - offline_scale_ = OpParam::GetVarValue("InScale", inputs, *scope); - } - // x = round(scale * x) - if (OpParam::HasAttr("round_type", attrs)) { - round_type_ = OpParam::GetAttr("round_type", attrs); - } - } - - public: - GType *online_scale_; - // quantize offline scale - GType *offline_scale_; - // if offine scale or not - bool offline_ = false; - // round method type - // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO; - RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO; -}; -#endif - -#ifdef SEQUENCE_EXPAND_OP -template -class SequenceExpandParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - SequenceExpandParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const 
AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - ref_level_ = -1; - if (OpParam::HasAttr("ref_level", attrs)) { - ref_level_ = OpParam::GetAttr("ref_level", attrs); - } - } - - public: - GType *input_x_; - GType *input_y_; - GType *output_; - int ref_level_; -}; -#endif // SEQUENCE_EXPAND_OP - -#ifdef SEQUENCE_POOL_OP -template -class SequencePoolParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - SequencePoolParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - pool_type_ = "MAX"; - if (OpParam::HasAttr("pooltype", attrs)) { - pool_type_ = OpParam::GetStringAttr("pooltype", attrs); - } - } - - public: - GType *input_; - GType *output_; - std::string pool_type_; -}; -#endif // SEQUENCE_EXPAND_OP - -#ifdef LOD_RESET_OP -template -class LodResetParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - LodResetParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - input_y_ = nullptr; - if (inputs.count("Y")) { - input_y_ = InputYFrom(inputs, *scope); - } else { - target_lod_ = OpParam::GetAttr>("target_lod", attrs); - } - if (HasAttr("append", attrs)) { - append = OpParam::GetAttr("append", attrs); - } - } - - public: - GType *input_x_; - GType *input_y_; - GType *output_; - std::vector target_lod_; - bool append; -}; -#endif // LOD_RESET_OP - -#ifdef LESS_THAN_OP -template -class 
CompareParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - CompareParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - axis_ = OpParam::GetAttr("axis", attrs); - } - - public: - GType *input_x_; - GType *input_y_; - GType *output_; - int axis_; -}; -#endif // LESS_THAN_OP - -#if defined(LOGICAL_AND_OP) || defined(LOGICAL_OR_OP) || defined(LOGICAL_XOR_OP) -template -class LogicalBinaryParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - LogicalBinaryParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - input_y_ = InputYFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - } - - const GType *InputX() const { return input_x_; } - const GType *InputY() const { return input_y_; } - GType *Out() const { return output_; } - - public: - GType *input_x_; - GType *input_y_; - GType *output_; -}; -#endif // LOGICAL_AND_OP LOGICAL_OR_OP LOGICAL_XOR_OP - -#ifdef LOGICAL_NOT_OP -template -class LogicalUnaryParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - LogicalUnaryParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - } - - const GType *InputX() const { return input_x_; } - GType *Out() const { return output_; } - - public: - GType *input_x_; - GType *output_; 
-}; -#endif // LOGICAL_NOT_OP - -#ifdef WRITE_TO_ARRAY_OP -template -class WriteToArrayParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - WriteToArrayParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = OpParam::GetVarValue("X", inputs, *scope); - index_ = OpParam::GetVarValue("I", inputs, *scope); - output_ = OpParam::GetVarValue>("Out", outputs, *scope); - } - - public: - GType *input_; - GType *index_; - std::vector *output_; -}; -#endif - -#ifdef READ_FROM_ARRAY_OP -template -class ReadFromArrayParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ReadFromArrayParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_ = OpParam::GetVarValue>("X", inputs, *scope); - index_ = OpParam::GetVarValue("I", inputs, *scope); - output_ = OpParam::GetVarValue("Out", outputs, *scope); - } - - public: - std::vector *input_; - GType *index_; - GType *output_; -}; -#endif - -#ifdef IS_EMPTY_OP -template -class IsEmptyParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - IsEmptyParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - } - - const GType *InputX() const { return input_x_; } - GType *Out() const { return output_; } - - public: - GType *input_x_; - GType *output_; -}; -#endif // IS_EMPTY_OP - -#ifdef INCREMENT_OP -template -class IncrementParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; 
- typedef typename DtypeTensorTrait::rtype RType; - - public: - IncrementParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, *scope); - step_ = OpParam::GetAttr("step", attrs); - } - - const GType *InputX() const { return input_x_; } - GType *Out() const { return output_; } - float Step() const { return step_; } - - public: - GType *input_x_; - GType *output_; - float step_; -}; -#endif // INCREMENT_OP -#ifdef PAD2D_OP -template -class Pad2DParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - Pad2DParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - paddings_ = OpParam::GetAttr>("paddings", attrs); - pad_value_ = OpParam::GetAttr("pad_value", attrs); - mode_ = OpParam::GetStringAttr("mode", attrs); - DLOG << "mode" << mode_; - } - const GType *InputX() const { return input_x_; } - GType *Out() const { return out_; } - - std::vector paddings_; - float pad_value_; - std::string mode_; - - private: - GType *input_x_; - GType *out_; -}; -#endif -#ifdef EXP_OP -template -class EXPParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - EXPParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - } - const GType *InputX() const { return input_x_; } - GType *Out() const { return out_; } - - private: - GType *input_x_; - GType *out_; -}; -#endif - -#ifdef PIXEL_SHUFFLE_OP -template 
-class PixelShuffleParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - PixelShuffleParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - upscale_factor_ = GetAttr("upscale_factor", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - const int &upscale_factor() const { return upscale_factor_; } - - private: - GType *input_x_; - GType *out_; - int upscale_factor_; -}; -#endif - -#ifdef GRID_SAMPLER_OP -template -class GridSamplerParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - GridSamplerParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - grid_ = GridFrom(inputs, *scope); - output_ = OutputFrom(outputs, *scope); - } - - const GType *InputX() const { return input_x_; } - const GType *Grid() const { return grid_; } - - GType *Output() const { return output_; } - - private: - GType *input_x_; - GType *grid_; - GType *output_; -}; -#endif - -#ifdef EXPAND_OP -template -class ExpandParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; - - public: - ExpandParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) - : OpParam(inputs, outputs, attrs, scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - expand_times = OpParam::GetAttr>("expand_times", attrs); - } - - const GType *InputX() const { return input_x_; } - - GType *Out() const { return out_; } - - 
std::vector expand_times; - - private: - GType *input_x_; - GType *out_; -}; - -#endif -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/pad2d_op.cpp b/mobile/src/operators/pad2d_op.cpp deleted file mode 100755 index d3ed4762e4..0000000000 --- a/mobile/src/operators/pad2d_op.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PAD2D_OP - -#include "operators/pad2d_op.h" -namespace paddle_mobile { -namespace operators { - -template -void Pad2DOp::InferShape() const { - auto input_dims = this->param_.InputX()->dims(); - const auto &paddings = this->param_.paddings_; - PADDLE_MOBILE_ENFORCE(paddings.size() == 4, - "Size of paddings should be equal to 4."); - - input_dims[2] += paddings[0] + paddings[1]; - input_dims[3] += paddings[2] + paddings[3]; - this->param_.Out()->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(pad2d, ops::Pad2DOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2DOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(pad2d, ops::Pad2DOp); -#endif -#endif // PAD2D_OP diff --git a/mobile/src/operators/pad2d_op.h b/mobile/src/operators/pad2d_op.h deleted file mode 100644 index 1a80cbac40..0000000000 --- 
a/mobile/src/operators/pad2d_op.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PAD2D_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/pad2d_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(Pad2D, Pad2DParam, Pad2DKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif // PAD2D_OP diff --git a/mobile/src/operators/pixel_shuffle_op.cpp b/mobile/src/operators/pixel_shuffle_op.cpp deleted file mode 100644 index 9105a72cfb..0000000000 --- a/mobile/src/operators/pixel_shuffle_op.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PIXEL_SHUFFLE_OP - -#include "operators/pixel_shuffle_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void PixelShuffleOp::InferShape() const { - auto x_dims = this->param_.InputX()->dims(); - int n = x_dims[0]; - int c = x_dims[1]; - int h = x_dims[2]; - int w = x_dims[3]; - int upscale_factor = this->param_.upscale_factor(); - this->param_.Out()->Resize( - framework::make_ddim({n, c / (upscale_factor * upscale_factor), - h * upscale_factor, w * upscale_factor})); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(pixel_shuffle, ops::PixelShuffleOp); -#endif - -#endif diff --git a/mobile/src/operators/pixel_shuffle_op.h b/mobile/src/operators/pixel_shuffle_op.h deleted file mode 100644 index a1c6f8e1ad..0000000000 --- a/mobile/src/operators/pixel_shuffle_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PIXEL_SHUFFLE_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/pixel_shuffle_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -using std::string; -template -class PixelShuffleOp : public framework::OperatorWithKernel< - DeviceType, PixelShuffleParam, - operators::PixelShuffleKernel> { - public: - PixelShuffleOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, PixelShuffleParam, - operators::PixelShuffleKernel>(type, inputs, outputs, - attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/polygon_box_transform_op.cpp b/mobile/src/operators/polygon_box_transform_op.cpp deleted file mode 100644 index a3eed0e2f3..0000000000 --- a/mobile/src/operators/polygon_box_transform_op.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POLYGONBOXTRANSFORM_OP - -#include "operators/polygon_box_transform_op.h" -namespace paddle_mobile { -namespace operators { - -template -void PolygonBoxTransformOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr, - "Input (Input) of get_shape op should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Output() != nullptr, - "Output (Output) of get_shape op should not be null."); - - auto input_dims = this->param_.Input()->dims(); - - PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "input's rank must be 4."); - PADDLE_MOBILE_ENFORCE(input_dims[1] % 2 == 0, - "input's second dimension must be even."); - - this->param_.Output()->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(polygon_box_transform, ops::PolygonBoxTransformOp); -#endif - -#endif diff --git a/mobile/src/operators/polygon_box_transform_op.h b/mobile/src/operators/polygon_box_transform_op.h deleted file mode 100644 index a4d1975e58..0000000000 --- a/mobile/src/operators/polygon_box_transform_op.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POLYGONBOXTRANSFORM_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/polygon_box_transform_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class PolygonBoxTransformOp - : public framework::OperatorWithKernel< - DeviceType, PolygonBoxTransformParam, - operators::PolygonBoxTransformKernel> { - public: - PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, PolygonBoxTransformParam, - operators::PolygonBoxTransformKernel>( - type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, PolygonBoxTransformParam, - operators::PolygonBoxTransformKernel>::OperatorWithKernel; - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/pool_op.cpp b/mobile/src/operators/pool_op.cpp deleted file mode 100644 index f73fe01cc7..0000000000 --- a/mobile/src/operators/pool_op.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef POOL_OP - -#include "operators/pool_op.h" -#include -#include "framework/op_proto_maker.h" -#include "framework/op_registry.h" - -namespace paddle_mobile { -namespace operators { - -int PoolOutputSize(int input_size, int filter_size, int padding, int stride, - bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; - } else { - output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; - } - return output_size; -} -template -void PoolOp::InferShape() const { - auto in_x_dims = this->param_.Input()->dims(); - std::vector ksize = this->param_.Ksize(); - std::vector paddings = this->param_.Paddings(); - std::vector strides = this->param_.Strides(); - bool ceil_mode = this->param_.isCeilMode(); - - if (this->param_.isGlobalPooling()) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i], ceil_mode)); - } - this->param_.Output()->Resize(framework::make_ddim(output_shape)); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(pool2d, ops::PoolOp); -#endif - -#endif diff --git a/mobile/src/operators/pool_op.h b/mobile/src/operators/pool_op.h deleted file mode 100644 index 861430f10b..0000000000 --- a/mobile/src/operators/pool_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef POOL_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/pool_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class PoolOp : public framework::OperatorWithKernel< - DeviceType, PoolParam, - operators::PoolKernel> { - public: - PoolOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::PoolKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - private: -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/prelu_op.cpp b/mobile/src/operators/prelu_op.cpp deleted file mode 100644 index 0c373ca711..0000000000 --- a/mobile/src/operators/prelu_op.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PRELU_OP - -#include "operators/prelu_op.h" -namespace paddle_mobile { -namespace operators { - -template -void PReluOp::InferShape() const { - auto input_dims = this->param_.InputX()->dims(); - this->param_.Out()->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -/* - * @b 每一个 op 都需要注册一下的, - * USE_OP的参数 和 REGISTER_OPERATOR的第一个参数 - * 都是需要和model中类型对应起来的 - * */ -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(prelu, ops::PReluOp); -#endif - -#endif diff --git a/mobile/src/operators/prelu_op.h b/mobile/src/operators/prelu_op.h deleted file mode 100644 index 92c2e7e620..0000000000 --- a/mobile/src/operators/prelu_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PRELU_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/prelu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class PReluOp : public framework::OperatorWithKernel< - DeviceType, PReluParam, - operators::PReluKernel> { - public: - PReluOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::PReluKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/prior_box_op.cpp b/mobile/src/operators/prior_box_op.cpp deleted file mode 100644 index da37273de5..0000000000 --- a/mobile/src/operators/prior_box_op.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "operators/prior_box_op.h" -#include - -namespace paddle_mobile { -namespace operators { - -#ifdef PRIORBOX_OP -template -void PriorBoxOp::InferShape() const { - auto input_dims = this->param_.Input()->dims(); - auto input_image_dims = this->param_.InputImage()->dims(); - auto min_sizes = this->param_.MinSizes(); - auto max_sizes = this->param_.MaxSizes(); - auto variances = this->param_.Variances(); - auto aspect_ratios = this->param_.AspectRatios(); - bool flip = this->param_.Flip(); - std::vector aspect_ratios_vec; - ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); - - size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); - if (!max_sizes.empty()) { - num_priors += max_sizes.size(); - } - - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec)); - this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec)); -} -#endif // PRIORBOX_OP - -#ifdef DENSITY_PRIORBOX_OP -template -void DensityPriorBoxOp::InferShape() const { - auto input_dims = this->param_.Input()->dims(); - auto input_image_dims = this->param_.InputImage()->dims(); - - auto &fixed_sizes = this->param_.FixedSizes(); - auto &fixed_ratios = this->param_.FixedRatios(); - auto &densities = this->param_.Densities(); - bool flatten = this->param_.FlattenTo2d(); - - size_t num_priors = 0; - for (size_t i = 0; i < densities.size(); ++i) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - if (!flatten) { - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec)); - this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec)); - } else { - int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; - 
this->param_.OutputBoxes()->Resize(framework::make_ddim({dim0, 4})); - this->param_.OutputVariances()->Resize(framework::make_ddim({dim0, 4})); - } -} -#endif // DENSITY_PRIORBOX_OP - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -#ifdef PRIORBOX_OP -REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp); -#endif // PRIORBOX_OP -#ifdef DENSITY_PRIORBOX_OP -REGISTER_OPERATOR_CPU(density_prior_box, ops::DensityPriorBoxOp); -#endif // DENSITY_PRIORBOX_OP -#endif // PADDLE_MOBILE_CPU - -#ifdef PADDLE_MOBILE_CL -#ifdef PRIORBOX_OP -REGISTER_OPERATOR_CL(prior_box, ops::PriorBoxOp); -#endif // PRIORBOX_OP -#ifdef DENSITY_PRIORBOX_OP -REGISTER_OPERATOR_CL(density_prior_box, ops::DensityPriorBoxOp); -#endif // DENSITY_PRIORBOX_OP -#endif // PADDLE_MOBILE_CL diff --git a/mobile/src/operators/prior_box_op.h b/mobile/src/operators/prior_box_op.h deleted file mode 100644 index 7a3c0466a0..0000000000 --- a/mobile/src/operators/prior_box_op.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/prior_box_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -#ifdef PRIORBOX_OP -DECLARE_OPERATOR(PriorBox, PriorBoxParam, PriorBoxKernel); -#endif - -#ifdef DENSITY_PRIORBOX_OP -DECLARE_OPERATOR(DensityPriorBox, DensityPriorBoxParam, DensityPriorBoxKernel); -#endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/mobile/src/operators/quantize_op.cpp b/mobile/src/operators/quantize_op.cpp deleted file mode 100644 index bf12ca2f83..0000000000 --- a/mobile/src/operators/quantize_op.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef QUANT_OP - -#include "operators/quantize_op.h" -#include - -namespace paddle_mobile { -namespace operators { - -template -void QuantizeOp::InferShape() const { - const auto &input_dims = this->param_.input_->dims(); - this->param_.output_->Resize(input_dims); - auto scale_dims = framework::make_ddim(std::vector{1}); - this->param_.online_scale_->Resize(scale_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp); -#endif - -#endif // QUANT_OP diff --git a/mobile/src/operators/quantize_op.h b/mobile/src/operators/quantize_op.h deleted file mode 100644 index 253113ad4b..0000000000 --- a/mobile/src/operators/quantize_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef QUANT_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/quantize_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class QuantizeOp : public framework::OperatorWithKernel< - DeviceType, QuantizeParam, - operators::QuantizeKernel> { - public: - QuantizeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::QuantizeKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // QUANT_OP diff --git a/mobile/src/operators/range_op.cpp b/mobile/src/operators/range_op.cpp deleted file mode 100644 index b7abb52f0f..0000000000 --- a/mobile/src/operators/range_op.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RANGE_OP - -#include "operators/range_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void RangeOp::InferShape() const { - auto s_dims = this->param_.Start()->dims(); - PADDLE_MOBILE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1), - "The shape of Input(Start) should be [1]."); - auto e_dims = this->param_.End()->dims(); - PADDLE_MOBILE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1), - "The shape of Input(End) should be [1]."); - auto step_dims = this->param_.Step()->dims(); - PADDLE_MOBILE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1), - "The shape of Input(Step) should be [1]."); - this->param_.Output()->Resize({-1}); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(range, ops::RangeOp); -#endif - -#endif // ASSIGN_OP diff --git a/mobile/src/operators/range_op.h b/mobile/src/operators/range_op.h deleted file mode 100644 index a3ca1a56ff..0000000000 --- a/mobile/src/operators/range_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RANGE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/range_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(Range, RangeParam, RangeKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/reduce_prod_op.cpp b/mobile/src/operators/reduce_prod_op.cpp deleted file mode 100644 index 9eb4866d4f..0000000000 --- a/mobile/src/operators/reduce_prod_op.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef REDUCE_PROD_OP - -#include "operators/reduce_prod_op.h" -#include -#include - -namespace paddle_mobile { -namespace operators { - -template -void ReduceProdOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr, - "Input (X) of ReduceOp op should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Output() != nullptr, - "Output (Output) of ReduceOp op should not be null."); - - auto x_dims = this->param_.Input()->dims(); - auto x_rank = x_dims.size(); - PADDLE_MOBILE_ENFORCE(x_rank <= 6, - "Tensors with rank at most 6 are supported."); - auto dims = this->param_.getDim(); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_MOBILE_ENFORCE( - dims[i] < x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - bool reduce_all = this->param_.isReduceAll(); - bool keep_dim = this->param_.isKeepDim(); - if (reduce_all) { - if (keep_dim) - this->param_.Output()->Resize( - framework::make_ddim(std::vector(x_rank, 1))); - else - this->param_.Output()->Resize({1}); - } else { - auto dims_vector = vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - auto out_dims = framework::make_ddim(dims_vector); - this->param_.Output()->Resize(out_dims); - if (std::is_same, Dtype>::value) { - if (dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. 
- this->param_.Output()->set_lod(this->param_.Input()->lod()); - } - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(reduce_prod, ops::ReduceProdOp); -#endif - -#endif // REDUCE_PROD_OP diff --git a/mobile/src/operators/reduce_prod_op.h b/mobile/src/operators/reduce_prod_op.h deleted file mode 100644 index 46af419d25..0000000000 --- a/mobile/src/operators/reduce_prod_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef REDUCE_PROD_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/reduce_prod_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -DECLARE_OPERATOR(ReduceProd, ReduceProdParam, ReduceProdKernel); - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/reshape2_op.cpp b/mobile/src/operators/reshape2_op.cpp deleted file mode 100644 index 29712e1818..0000000000 --- a/mobile/src/operators/reshape2_op.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef RESHAPE2_OP - -#include "operators/reshape2_op.h" -#include -#include "operators/kernel/reshape_kernel.h" -namespace paddle_mobile { -namespace operators { - -template -void Reshape2Op::InferShape() const { - if (this->param_.InputShape() != nullptr) { - return; - } - auto &shape = this->param_.Shape(); - auto input_x_dims = this->param_.InputX()->dims(); - bool shouldResize = true; - if (std::is_same, Dtype>::value) { - auto input_dim_size = input_x_dims.size(); - if (input_dim_size > 4) { - for (int i = 0; i < input_dim_size - 4; ++i) { - if (input_x_dims[i] != 0 && input_x_dims[i] != 1) { - shouldResize = false; - break; - } - } - if (shouldResize) { - std::vector temp_intput_dims; - temp_intput_dims.reserve(static_cast(4)); - for (int i = input_dim_size - 4; i < input_dim_size; ++i) { - temp_intput_dims.push_back(input_x_dims[i]); - } - framework::DDim temp_ddim = framework::make_ddim(temp_intput_dims); - this->param_.InputX()->Resize(temp_ddim); - input_x_dims = this->param_.InputX()->dims(); - } - } - } - - auto out_dims = ValidateShape(shape, input_x_dims); - this->param_.Out()->Resize(out_dims); - if (std::is_same, Dtype>::value) { - input_x_dims = this->param_.InputX()->dims(); - shouldResize = true; - if (out_dims.size() > 4) { - for (int i = 0; i < out_dims.size() - 4; ++i) { - if (out_dims[i] != 0 && out_dims[i] != 1) { - shouldResize = false; - break; - } - } - if (shouldResize) { - std::vector temp_output_dims; - temp_output_dims.reserve(static_cast(4)); - for (int i = out_dims.size() - 4; i < out_dims.size(); ++i) { - 
temp_output_dims.push_back(out_dims[i]); - } - framework::DDim temp_ddim = framework::make_ddim(temp_output_dims); - this->param_.Out()->Resize(temp_ddim); - } - } - } - std::vector xshape_dims(input_x_dims.size() + 1, 0); - for (int i = 0; i < input_x_dims.size(); ++i) { - xshape_dims[i + 1] = input_x_dims[i]; - } - this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims)); - if (std::is_same, Dtype>::value) { - this->param_.OutputXShape()->Resize(input_x_dims); - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(reshape2, ops::Reshape2Op); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op); -#endif - -#endif diff --git a/mobile/src/operators/reshape2_op.h b/mobile/src/operators/reshape2_op.h deleted file mode 100644 index 19c5e59f71..0000000000 --- a/mobile/src/operators/reshape2_op.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE2_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/reshape2_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class Reshape2Op : public framework::OperatorWithKernel< - DeviceType, Reshape2Param, - operators::Reshape2Kernel> { - public: - Reshape2Op(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::Reshape2Kernel>( - type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, Reshape2Param, - operators::Reshape2Kernel>::OperatorWithKernel; - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/reshape_op.cpp b/mobile/src/operators/reshape_op.cpp deleted file mode 100644 index a58a607207..0000000000 --- a/mobile/src/operators/reshape_op.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef RESHAPE_OP - -#include "operators/reshape_op.h" -#include -namespace paddle_mobile { -namespace operators { - -template -void ReshapeOp::InferShape() const { - /// todo: add InputShape() detection. 
- auto &shape = this->param_.Shape(); - auto input_x_dims = this->param_.InputX()->dims(); - auto out_dims = ValidateShape(shape, input_x_dims); - this->param_.Out()->Resize(out_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp); -#endif - -#endif diff --git a/mobile/src/operators/reshape_op.h b/mobile/src/operators/reshape_op.h deleted file mode 100644 index 67e86044ea..0000000000 --- a/mobile/src/operators/reshape_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESHAPE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/reshape_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class ReshapeOp : public framework::OperatorWithKernel< - DeviceType, ReshapeParam, - operators::ReshapeKernel> { - public: - ReshapeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ReshapeKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/resize_op.cpp b/mobile/src/operators/resize_op.cpp deleted file mode 100644 index fcdf59b473..0000000000 --- a/mobile/src/operators/resize_op.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESIZE_OP - -#include "operators/resize_op.h" -#include -namespace paddle_mobile { -namespace operators { - -template -void ResizeOp::InferShape() const { - auto out_dims = CalOutputShape(this->param_); - this->param_.Out()->Resize(out_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(resize, ops::ResizeOp); -#endif - -#endif diff --git a/mobile/src/operators/resize_op.h b/mobile/src/operators/resize_op.h deleted file mode 100644 index 6088ad4f51..0000000000 --- a/mobile/src/operators/resize_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef RESIZE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/resize_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class ResizeOp : public framework::OperatorWithKernel< - DeviceType, ResizeParam, - operators::ResizeKernel> { - public: - ResizeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ResizeKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/scale_op.cpp b/mobile/src/operators/scale_op.cpp deleted file mode 100644 index 4236d1203b..0000000000 --- a/mobile/src/operators/scale_op.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SCALE_OP - -#include "operators/scale_op.h" -#include -namespace paddle_mobile { -namespace operators { - -template -void ScaleOp::InferShape() const { - auto input_dims = this->param_.InputX()->dims(); - this->param_.Out()->Resize(input_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(scale, ops::ScaleOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(scale, ops::ScaleOp); -#endif -#endif diff --git a/mobile/src/operators/scale_op.h b/mobile/src/operators/scale_op.h deleted file mode 100644 index aacacd9245..0000000000 --- a/mobile/src/operators/scale_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SCALE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/scale_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class ScaleOp : public framework::OperatorWithKernel< - DeviceType, ScaleParam, - operators::ScaleKernel> { - public: - ScaleOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ScaleKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/sequence_ops/sequence_expand_op.cpp b/mobile/src/operators/sequence_ops/sequence_expand_op.cpp deleted file mode 100644 index a1ff839813..0000000000 --- a/mobile/src/operators/sequence_ops/sequence_expand_op.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SEQUENCE_EXPAND_OP - -#include "operators/sequence_ops/sequence_expand_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void SequenceExpandOp::InferShape() const { - const auto *input_x = this->param_.input_x_; - const auto *input_y = this->param_.input_y_; - const auto &x_lod = input_x->lod(); - const auto &y_lod = input_y->lod(); - int ref_level = this->param_.ref_level_; - if (ref_level == -1) ref_level = y_lod.size() - 1; - - auto out_dims = input_x->dims(); - int64_t out_first_dim = 0; - - if (y_lod[ref_level].size() > 1) { - for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { - int x_seq_len = 1; - if (x_lod.size() == 1) { - x_seq_len = x_lod[0][i] - x_lod[0][i - 1]; - } - out_first_dim += - (y_lod[ref_level][i] - y_lod[ref_level][i - 1]) * x_seq_len; - } - out_dims[0] = out_first_dim; - } - this->param_.output_->Resize(out_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(sequence_expand, ops::SequenceExpandOp); -#endif - -#endif // SEQUENCE_EXPAND_OP diff --git a/mobile/src/operators/sequence_ops/sequence_expand_op.h b/mobile/src/operators/sequence_ops/sequence_expand_op.h deleted file mode 100644 index f854272d7b..0000000000 --- a/mobile/src/operators/sequence_ops/sequence_expand_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SEQUENCE_EXPAND_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/sequence_kernels.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class SequenceExpandOp : public framework::OperatorWithKernel< - DeviceType, SequenceExpandParam, - operators::SequenceExpandKernel> { - public: - SequenceExpandOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, SequenceExpandParam, - operators::SequenceExpandKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // SEQUENCE_EXPAND_OP diff --git a/mobile/src/operators/sequence_ops/sequence_pool_op.cpp b/mobile/src/operators/sequence_ops/sequence_pool_op.cpp deleted file mode 100644 index 4165d8ef60..0000000000 --- a/mobile/src/operators/sequence_ops/sequence_pool_op.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SEQUENCE_POOL_OP - -#include "operators/sequence_ops/sequence_pool_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void SequencePoolOp::InferShape() const { - const auto *input = this->param_.input_; - auto out_dims = input->dims(); - out_dims[0] = input->lod()[0].size() - 1; - this->param_.output_->Resize(out_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(sequence_pool, ops::SequencePoolOp); -#endif - -#endif // SEQUENCE_POOL_OP diff --git a/mobile/src/operators/sequence_ops/sequence_pool_op.h b/mobile/src/operators/sequence_ops/sequence_pool_op.h deleted file mode 100644 index aae892f9f3..0000000000 --- a/mobile/src/operators/sequence_ops/sequence_pool_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SEQUENCE_POOL_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/sequence_kernels.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class SequencePoolOp : public framework::OperatorWithKernel< - DeviceType, SequencePoolParam, - operators::SequencePoolKernel> { - public: - SequencePoolOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, SequencePoolParam, - operators::SequencePoolKernel>(type, inputs, outputs, - attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // SEQUENCE_POOL_OP diff --git a/mobile/src/operators/sequence_ops/sequence_softmax_op.cpp b/mobile/src/operators/sequence_ops/sequence_softmax_op.cpp deleted file mode 100644 index 602e0d2975..0000000000 --- a/mobile/src/operators/sequence_ops/sequence_softmax_op.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SEQUENCE_SOFTMAX_OP - -#include "operators/sequence_ops/sequence_softmax_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void SequenceSoftmaxOp::InferShape() const { - const auto *input_x = this->param_.InputX(); - const auto &x_lod = input_x->lod(); - - this->param_.Out()->Resize(input_x->dims()); - this->param_.Out()->set_lod(input_x->lod()); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(sequence_softmax, ops::SequenceSoftmaxOp); -#endif - -#endif // SEQUENCE_SOFTMAX_OP diff --git a/mobile/src/operators/sequence_ops/sequence_softmax_op.h b/mobile/src/operators/sequence_ops/sequence_softmax_op.h deleted file mode 100644 index f0578f6ed3..0000000000 --- a/mobile/src/operators/sequence_ops/sequence_softmax_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SEQUENCE_SOFTMAX_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/sequence_kernels.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class SequenceSoftmaxOp : public framework::OperatorWithKernel< - DeviceType, SoftmaxParam, - operators::SequenceSoftmaxKernel> { - public: - SequenceSoftmaxOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, SoftmaxParam, - operators::SequenceSoftmaxKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // SEQUENCE_SOFTMAX_OP diff --git a/mobile/src/operators/shape_op.cpp b/mobile/src/operators/shape_op.cpp deleted file mode 100644 index f3ef72c16f..0000000000 --- a/mobile/src/operators/shape_op.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SHAPE_OP - -#include "operators/shape_op.h" - -namespace paddle_mobile { -namespace operators { -template -void ShapeOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr, - "Input (Input) of get_shape op should not be null."); - PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, - "Output (Out) of get_shape op should not be null."); - this->param_.Out()->Resize({this->param_.Input()->dims().size()}); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(shape, ops::ShapeOp); -#endif - -#endif diff --git a/mobile/src/operators/shape_op.h b/mobile/src/operators/shape_op.h deleted file mode 100644 index 05bc611bc5..0000000000 --- a/mobile/src/operators/shape_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SHAPE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/shape_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class ShapeOp : public framework::OperatorWithKernel< - DeviceType, ShapeParam, - operators::ShapeKernel> { - public: - ShapeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::ShapeKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/slice_op.cpp b/mobile/src/operators/slice_op.cpp deleted file mode 100644 index 29fe870ae3..0000000000 --- a/mobile/src/operators/slice_op.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SLICE_OP - -#include "operators/slice_op.h" -#include -#include - -namespace paddle_mobile { -namespace operators { - -template -void SliceOp::InferShape() const { - auto axes = this->param_.axes_; - auto input = this->param_.input_; - auto output = this->param_.output_; - if (std::is_same, Dtype>::value) { - auto output_dims = output->dims(); - auto output_dims_size = output_dims.size(); - bool should_resize = true; - if (output_dims_size > 4) { - for (int i = 0; i < output_dims_size - 4; ++i) { - if (output_dims[i] != 0 && output_dims[i] != 1) { - should_resize = false; - break; - } - } - if (should_resize) { - std::vector temp_output_dims; - temp_output_dims.reserve(static_cast(4)); - for (int i = output_dims_size - 4; i < output_dims_size; ++i) { - temp_output_dims.push_back(output_dims[i]); - } - framework::DDim temp_ddim = framework::make_ddim(temp_output_dims); - this->param_.output_->Resize(temp_ddim); - } - } - } - PADDLE_MOBILE_ENFORCE(axes.size() == 1, "axes size should equals 1"); - PADDLE_MOBILE_ENFORCE(input->dims().size() == output->dims().size(), - "input dim size should equals output dim size"); - if (std::is_same, Dtype>::value) { - PADDLE_MOBILE_ENFORCE( - output->dims().size() - - (axes[0] - (this->param_.original_output_dims_size_ - - this->param_.output_->dims().size())) == - 3, - "op only support slice channel now"); - } - auto starts = this->param_.starts_; - auto ends = this->param_.ends_; - framework::DDim out_dims(input->dims()); - PADDLE_MOBILE_ENFORCE(starts.size() == ends.size(), - "starts.size should equal ends.size"); - PADDLE_MOBILE_ENFORCE(axes.size() == starts.size(), - "axes.size should equal starts.size"); - int dim_value, start, end; - for (size_t i = 0; i < axes.size(); ++i) { - int axis = axes[i] - (this->param_.original_output_dims_size_ - - this->param_.output_->dims().size()); - dim_value = out_dims[axis]; - if (dim_value > 0) { - start = starts[i] < 0 ? 
(starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; - start = std::max(start, 0); - end = std::max(end, 0); - // start = std::min(start, dim_value); - end = std::min(end, dim_value); - // start = std::min(start, end); - PADDLE_MOBILE_ENFORCE(end > start, "end should greater than start"); - out_dims[axis] = end - start; - } - } - output->Resize(out_dims); - if (std::is_same, Dtype>::value) { - LoDTensor *output_lod = reinterpret_cast(output); - LoDTensor *input_lod = reinterpret_cast(input); - if (axes[0] != 0) { - output_lod->set_lod(input_lod->lod()); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(slice, ops::SliceOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(slice, ops::SliceOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(slice, ops::SliceOp); -#endif -#endif // SLICE_OP diff --git a/mobile/src/operators/slice_op.h b/mobile/src/operators/slice_op.h deleted file mode 100644 index 0d01705f7d..0000000000 --- a/mobile/src/operators/slice_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SLICE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/slice_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class SliceOp : public framework::OperatorWithKernel< - DeviceType, SliceParam, - operators::SliceKernel> { - public: - SliceOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::SliceKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/softmax_op.cpp b/mobile/src/operators/softmax_op.cpp deleted file mode 100644 index d88fc0a9f1..0000000000 --- a/mobile/src/operators/softmax_op.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SOFTMAX_OP - -#include "operators/softmax_op.h" - -namespace paddle_mobile { -namespace operators { -template -void SoftmaxOp::InferShape() const { - this->param_.Out()->Resize(this->param_.InputX()->dims()); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp); -#endif -#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD) -REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(softmax, ops::SoftmaxOp); -#endif - -#endif diff --git a/mobile/src/operators/softmax_op.h b/mobile/src/operators/softmax_op.h deleted file mode 100644 index 2f9285a21d..0000000000 --- a/mobile/src/operators/softmax_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SOFTMAX_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/softmax_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -template -class SoftmaxOp : public framework::OperatorWithKernel< - DeviceType, SoftmaxParam, - operators::SoftmaxKernel> { - public: - SoftmaxOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel, - operators::SoftmaxKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - private: -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/split_op.cpp b/mobile/src/operators/split_op.cpp deleted file mode 100644 index ec82214a48..0000000000 --- a/mobile/src/operators/split_op.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SPLIT_OP -#include "operators/split_op.h" -#include - -namespace paddle_mobile { -namespace operators { - -template -void SplitOp::InferShape() const { - PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, - "Input(X) of SplitOp should not be null."); - // std::string str; - // str.size() - const auto &outs = this->param_.Outs(); - PADDLE_MOBILE_ENFORCE(outs.size() >= 1UL, - "Outputs(Out) of SplitOp should not be empty."); - - auto in_dims = this->param_.InputX()->dims(); - size_t axis = static_cast(this->param_.Axis()); - size_t num = static_cast(this->param_.Num()); - - const auto §ions = this->param_.Sections(); - - const size_t outs_number = outs.size(); - std::vector outs_dims; - outs_dims.reserve(outs_number); - - if (num > 0) { - int64_t in_axis_dim = in_dims[axis]; - PADDLE_MOBILE_ENFORCE(in_axis_dim % num == 0, - "tensor split does not result" - " in an equal division"); - size_t out_axis_dim = in_axis_dim / num; - for (size_t i = 0; i < outs_number; ++i) { - auto dim = in_dims; - dim[axis] = out_axis_dim; - outs_dims.push_back(dim); - } - } else if (sections.size() > 0) { - PADDLE_MOBILE_ENFORCE(sections.size() == outs_number, - "tensor split sections size" - "should be equal to output size."); - for (size_t i = 0; i < outs_number; ++i) { - auto dim = in_dims; - dim[axis] = sections[i]; - outs_dims.push_back(dim); - } - } - - PADDLE_MOBILE_ENFORCE(outs_dims.size() == outs.size(), - "length==dims.size() must be true!"); - for (int j = 0; j < outs_dims.size(); ++j) { - outs[j]->Resize(outs_dims[j]); - } - - // todo lod impl - // if (axis != 0) { - // // Only pass LoD when not spliting along the first dim. 
- // for (size_t i = 0; i < outs_number; ++i) { - // ctx->ShareLoD("X", "Out", 0, i); - // } - // } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(split, ops::SplitOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(split, ops::SplitOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(split, ops::SplitOp); -#endif - -#endif // SPLIT_OP diff --git a/mobile/src/operators/split_op.h b/mobile/src/operators/split_op.h deleted file mode 100644 index 4801defb49..0000000000 --- a/mobile/src/operators/split_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SPLIT_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/split_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class SplitOp : public framework::OperatorWithKernel< - DeviceType, SplitParam, - operators::SplitKernel> { - public: - SplitOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::SplitKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; -}; -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/sum_op.cpp b/mobile/src/operators/sum_op.cpp deleted file mode 100644 index 1049edcbd5..0000000000 --- a/mobile/src/operators/sum_op.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SUM_OP - -#include - -#include "operators/sum_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void SumOp::InferShape() const { - auto inputs = this->param_.Inputs(); - const size_t n = inputs.size(); - - std::vector inputs_dims; - inputs_dims.reserve(n); - for (int i = 0; i < n; i++) { - inputs_dims.push_back(inputs[i]->dims()); - } - - if (n == 1) { - DLOG << "Warning: sum op have only one input, " - "may waste memory"; - } - - framework::DDim in_dim({0}); - - for (auto& x_dim : inputs_dims) { - if (framework::product(x_dim) == 0) { - continue; - } - if (framework::product(in_dim) == 0) { - in_dim = x_dim; - } else { - PADDLE_MOBILE_ENFORCE(in_dim == x_dim, - "input tensors must have same shape"); - } - } - - this->param_.Out()->Resize(in_dim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(sum, ops::SumOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - -#endif diff --git a/mobile/src/operators/sum_op.h b/mobile/src/operators/sum_op.h deleted file mode 100644 index 3ee5465fc8..0000000000 --- a/mobile/src/operators/sum_op.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef SUM_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/sum_kernel.h" -#include "operators/op_param.h" -namespace paddle_mobile { -namespace operators { -using std::string; -template -class SumOp : public framework::OperatorWithKernel< - DeviceType, SumParam, - operators::SumKernel> { - public: - SumOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::SumKernel>( - type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SumParam, - operators::SumKernel>::OperatorWithKernel; - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/top_k_op.cpp b/mobile/src/operators/top_k_op.cpp deleted file mode 100644 index c27b24d7e8..0000000000 --- a/mobile/src/operators/top_k_op.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef TOP_K_OP - -#include "operators/top_k_op.h" - -namespace paddle_mobile { -namespace operators { - -template -void TopKOp::InferShape() const { - const int k = this->param_.k_; - auto dims = this->param_.input_->dims(); - // should check k <= dims[-1] && k >= 1 - dims[dims.size() - 1] = k; - this->param_.output_->Resize(dims); - this->param_.indices_->Resize(dims); - if (std::is_same, Dtype>::value) { - this->param_.output_->set_lod(this->param_.input_->lod()); - this->param_.indices_->set_lod(this->param_.input_->lod()); - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(top_k, ops::TopKOp); -#endif - -#endif // TOP_K_OP diff --git a/mobile/src/operators/top_k_op.h b/mobile/src/operators/top_k_op.h deleted file mode 100644 index 4c182d6ffe..0000000000 --- a/mobile/src/operators/top_k_op.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef TOP_K_OP - -#pragma once - -#include -#include "framework/operator.h" -#include "operators/kernel/kernels.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -class TopKOp : public framework::OperatorWithKernel< - DeviceType, TopKParam, - operators::TopKKernel> { - public: - TopKOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel, - operators::TopKKernel>( - type, inputs, outputs, attrs, scope) {} - // inference output shape - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif // TOP_K_OP diff --git a/mobile/src/operators/transpose2_op.cpp b/mobile/src/operators/transpose2_op.cpp deleted file mode 100644 index ca9ceaafbd..0000000000 --- a/mobile/src/operators/transpose2_op.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef TRANSPOSE2_OP - -#include - -#include "common/enforce.h" -#include "operators/transpose2_op.h" -namespace paddle_mobile { -namespace operators { - -template -void Transpose2Op::InferShape() const { - auto input_x_dims = this->param_.InputX()->dims(); - auto axis = this->param_.Axis(); - - size_t x_dims_size = input_x_dims.size(); - size_t axis_size = axis.size(); - - if (std::is_same, Dtype>::value) { - bool shouldResize = true; - int diff_dim = 0; - if (axis_size > 4) { - for (int i = 0; i < axis_size - 4; ++i) { - if (axis[i] != i) { - shouldResize = false; - break; - } else { - diff_dim++; - } - } - if (shouldResize) { - std::vector temp_axis_dims; - temp_axis_dims.reserve(static_cast(4)); - for (int i = axis_size - 4; i < axis_size; ++i) { - temp_axis_dims.push_back(axis[i] - diff_dim); - } - axis.resize(4); - axis.clear(); - axis.insert(axis.begin(), temp_axis_dims.begin(), temp_axis_dims.end()); - } - } - - auto input_dim_size = input_x_dims.size(); - shouldResize = true; - if (input_dim_size > 4) { - for (int i = 0; i < input_dim_size - 4; ++i) { - if (input_x_dims[i] != 0 && input_x_dims[i] != 1) { - shouldResize = false; - break; - } - } - if (shouldResize) { - std::vector temp_intput_dims; - temp_intput_dims.reserve(static_cast(4)); - for (int i = input_dim_size - 4; i < input_dim_size; ++i) { - temp_intput_dims.push_back(input_x_dims[i]); - } - framework::DDim temp_ddim = framework::make_ddim(temp_intput_dims); - this->param_.InputX()->Resize(temp_ddim); - } - } - - axis_size = axis.size(); - input_x_dims = this->param_.InputX()->dims(); - x_dims_size = input_x_dims.size(); - } - - PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size), - "input_dims must " - "be equal to the axis_size. 
") - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - PADDLE_MOBILE_ENFORCE( - axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, - "Each element of Attribute axis should be a unique value " - "range from 0 to (dims - 1), " - "where the dims is the axis's size"); - } - framework::DDim out_dims(input_x_dims); - for (size_t i = 0; i < axis_size; i++) { - out_dims[i] = input_x_dims[axis[i]]; - } - this->param_.Out()->Resize(out_dims); - std::vector xshape_dims(input_x_dims.size() + 1, 0); - for (int i = 0; i < input_x_dims.size(); ++i) { - xshape_dims[i + 1] = input_x_dims[i]; - } - this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims)); - if (std::is_same, Dtype>::value) { - this->param_.OutputXShape()->Resize(input_x_dims); - } -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(transpose2, ops::Transpose2Op); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(transpose2, ops::Transpose2Op); -#endif -#endif // TRANSPOSE_OP diff --git a/mobile/src/operators/transpose2_op.h b/mobile/src/operators/transpose2_op.h deleted file mode 100644 index 2552688ca6..0000000000 --- a/mobile/src/operators/transpose2_op.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TRANSPOSE2_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/transpose2_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class Transpose2Op : public framework::OperatorWithKernel< - DeviceType, Transpose2Param, - operators::Transpose2Kernel> { - public: - Transpose2Op(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, Transpose2Param, - operators::Transpose2Kernel>(type, inputs, outputs, - attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, Transpose2Param, - operators::Transpose2Kernel>::OperatorWithKernel; - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/operators/transpose_op.cpp b/mobile/src/operators/transpose_op.cpp deleted file mode 100644 index 820a4e354d..0000000000 --- a/mobile/src/operators/transpose_op.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef TRANSPOSE_OP - -#include - -#include "common/enforce.h" -#include "operators/transpose_op.h" -namespace paddle_mobile { -namespace operators { - -template -void TransposeOp::InferShape() const { - auto input_x_dims = this->param_.InputX()->dims(); - auto axis = this->param_.Axis(); - - size_t x_dims_size = input_x_dims.size(); - size_t axis_size = axis.size(); - - PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size), - "input_dims must " - "be equal to the axis_size. ") - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - PADDLE_MOBILE_ENFORCE( - axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, - "Each element of Attribute axis should be a unique value " - "range from 0 to (dims - 1), " - "where the dims is the axis's size"); - } - framework::DDim out_dims(input_x_dims); - for (size_t i = 0; i < axis_size; i++) { - out_dims[i] = input_x_dims[axis[i]]; - } - this->param_.Out()->Resize(out_dims); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp); -#endif -#ifdef PADDLE_MOBILE_CL -REGISTER_OPERATOR_CL(transpose, ops::TransposeOp); -#endif - -#endif // TRANSPOSE_OP diff --git a/mobile/src/operators/transpose_op.h b/mobile/src/operators/transpose_op.h deleted file mode 100644 index cf03cb3825..0000000000 --- a/mobile/src/operators/transpose_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef TRANSPOSE_OP - -#pragma once - -#include - -#include "framework/operator.h" -#include "operators/kernel/transpose_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using paddle_mobile::framework::Tensor; - -template -class TransposeOp : public framework::OperatorWithKernel< - DeviceType, TransposeParam, - operators::TransposeKernel> { - public: - TransposeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, TransposeParam, - operators::TransposeKernel>(type, inputs, outputs, - attrs, scope) {} - void InferShape() const override; -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/mobile/src/pass/memory_optimize.cpp b/mobile/src/pass/memory_optimize.cpp deleted file mode 100644 index d9cfa13899..0000000000 --- a/mobile/src/pass/memory_optimize.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "pass/memory_optimize.h" -#include -#include "framework/lod_tensor.h" - -namespace paddle_mobile { -namespace pass { - -void MemoryOptPass::AppendBlockVars(const framework::BlockDesc *block) { - // block_vars_.clear(); - for (const auto var : block->Vars()) { - block_vars_[var->Name()] = var.get(); - } -} - -bool MemoryOptPass::IsPersistable(const std::string name) { - const auto it = block_vars_.find(name); - if (it != block_vars_.end()) { - return it->second->Persistable(); - } - return false; -} - -VarNode *MemoryOptPass::CreateNode(const std::string name) { - auto it = created_nodes_.find(name); - if (it != created_nodes_.end()) { - ++(it->second->count); - return it->second; - } - VarNode *var = new VarNode; - var->name = name; - var->count = 1; - var->visited = false; - created_nodes_[name] = var; - return var; -} - -void MemoryOptPass::operator()( - const framework::ProgramDesc *program, framework::Scope *scope, - MemoryOptimizationLevel memory_optimization_level) { - const auto &blocks = program->Blocks(); - for (const auto &block : blocks) { - // access all variables in each block - AppendBlockVars(block.get()); - - reused_nodes_.clear(); - // collect all not persistable variables, and accumulate - // it's reference count - std::stack empty_var_nodes; - analysis_nodes_.swap(empty_var_nodes); - - std::vector exclude_var_names; - for (const auto &op : block->Ops()) { - for (const auto &inputs : op->GetInputs()) { - for (const auto &input : inputs.second) { - if (!IsPersistable(input)) { - if (memory_optimization_level == MemoryOptimizationWithoutFeeds) { - if (op->Type() == "feed") { - exclude_var_names.push_back(input); - } - } - } - } - } - } - - std::vector fetch_var_nodes; - for (const auto &op : block->Ops()) { - DLOG << "op_desc->Type(): " << op->Type(); - for (const auto &outputs : op->GetOutputs()) { - for (const auto &output : outputs.second) { - if (!IsPersistable(output) && - std::find(exclude_var_names.begin(), 
exclude_var_names.end(), - output) == exclude_var_names.end()) { - DLOG << "output: " << output; - VarNode *node = CreateNode(output); - analysis_nodes_.push(node); - } - } - } - for (const auto &inputs : op->GetInputs()) { - for (const auto &input : inputs.second) { - if (!IsPersistable(input) && - std::find(exclude_var_names.begin(), exclude_var_names.end(), - input) == exclude_var_names.end()) { - DLOG << "input: " << input; - VarNode *node = CreateNode(input); - analysis_nodes_.push(node); - if (op->Type() == "fetch") { - fetch_var_nodes.push_back(node); - } - } - } - } - for (const auto &outputs : op->GetOutputs()) { - for (const auto &output : outputs.second) { - if (!IsPersistable(output) && - std::find(exclude_var_names.begin(), exclude_var_names.end(), - output) == exclude_var_names.end()) { - DLOG << "output: " << output; - VarNode *node = CreateNode(output); - analysis_nodes_.push(node); - } - } - } - } - - // apply optimize - while (!analysis_nodes_.empty()) { - auto *node = analysis_nodes_.top(); - analysis_nodes_.pop(); - // only not visited node can reuse memory between other nodes - // with 0 count which indicate they will not be used any more - if (!node->visited) { - bool reused = false; - // find out a possable reuse list - for (auto &list : reused_nodes_) { - if (list.back()->count == 0 && - std::find(fetch_var_nodes.begin(), fetch_var_nodes.end(), - list.back()) == fetch_var_nodes.end()) { - list.push_back(node); - reused = true; - break; - } - } - // create new list if can't find a reused list - if (!reused) { - std::vector list; - list.push_back(node); - reused_nodes_.push_back(std::move(list)); - } - } - node->visited = true; - node->count -= 1; - } - - // shared data within all variables in the same reused list - for (const auto &list : reused_nodes_) { - DLOG << "\n"; - DLOG << "share memory within these variables"; - std::string name = list[0]->name; - auto *reused_var = scope->Var(name); - auto *reuse_tensor = - reused_var->template 
GetMutable(); - reuse_tensor->mutable_data(); - for (const auto &node : list) { - DLOG << node->name; - auto *var = scope->Var(node->name); - auto *tensor = var->template GetMutable(); - tensor->ShareHolderWith(*reuse_tensor); - } - } - } -} - -} // namespace pass -} // namespace paddle_mobile diff --git a/mobile/src/pass/memory_optimize.h b/mobile/src/pass/memory_optimize.h deleted file mode 100644 index f0171c5ba6..0000000000 --- a/mobile/src/pass/memory_optimize.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include "framework/program/program.h" -#include "pass/pass_base.h" - -namespace paddle_mobile { -namespace pass { - -typedef struct { - std::string name; // variable name - int count; // reference count - bool visited; -} VarNode; - -// MemoryOptPass will analyze the program, and reuse memory between -// variables as much as possible -class MemoryOptPass : public PassBase { - public: - MemoryOptPass() {} - virtual ~MemoryOptPass() { - for (auto &it : created_nodes_) { - delete it.second; - } - } - - void operator()(const framework::ProgramDesc *program, - framework::Scope *scope, - MemoryOptimizationLevel memory_optimization_level); - - void AppendBlockVars(const framework::BlockDesc *block); - - bool IsPersistable(const std::string name); - - VarNode *CreateNode(const std::string name); - - private: - std::stack analysis_nodes_; - std::vector> reused_nodes_; - std::unordered_map created_nodes_; - std::unordered_map block_vars_; -}; - -} // namespace pass -} // namespace paddle_mobile diff --git a/mobile/src/pass/memory_optimize_cl.cpp b/mobile/src/pass/memory_optimize_cl.cpp deleted file mode 100644 index 53bb675f17..0000000000 --- a/mobile/src/pass/memory_optimize_cl.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_MOBILE_CL -#include "pass/memory_optimize_cl.h" -#include -#include -#include "framework/cl/cl_image.h" -#include "framework/lod_tensor.h" -namespace paddle_mobile { -namespace pass { - -void MemoryOptPassCl::AppendBlockVars(const framework::BlockDesc *block) { - // block_vars_.clear(); - for (const auto var : block->Vars()) { - block_vars_[var->Name()] = var.get(); - } -} - -bool MemoryOptPassCl::IsPersistable(const std::string name) { - const auto it = block_vars_.find(name); - if (it != block_vars_.end()) { - return it->second->Persistable(); - } - return false; -} - -ClVarNode *MemoryOptPassCl::CreateNode(const std::string name) { - auto it = created_nodes_.find(name); - if (it != created_nodes_.end()) { - ++(it->second->count); - return it->second; - } - ClVarNode *var = new ClVarNode; - var->name = name; - var->count = 1; - var->visited = false; - created_nodes_[name] = var; - return var; -} - -void MemoryOptPassCl::operator()( - const framework::ProgramDesc *program, framework::Scope *scope, - MemoryOptimizationLevel memory_optimization_level, - framework::DDim target_dims) { - const auto &blocks = program->Blocks(); - for (const auto &block : blocks) { - // access all variables in each block - AppendBlockVars(block.get()); - reused_nodes_.clear(); - // collect all not persistable variables, and accumulate - // it's reference count - std::stack empty_var_nodes; - analysis_nodes_.swap(empty_var_nodes); - - std::vector exclude_var_names; - for (const auto &op : block->Ops()) { - for (const auto &inputs : op->GetInputs()) { - for (const auto &input : inputs.second) { - if (!IsPersistable(input)) { - if (memory_optimization_level == MemoryOptimizationWithoutFeeds) { - if (op->Type() == "feed") { - exclude_var_names.push_back(input); - } - } - } - } - } - } - - std::vector fetch_var_nodes; - for (const auto &op : block->Ops()) { - LOG(kNO_LOG) << "op_desc->Type(): " << op->Type(); - for (const auto &outputs : op->GetOutputs()) { - for (const 
auto &output : outputs.second) { - // not a persistable and not a exclude one ,then add it to - // analysis_nodes - if (!IsPersistable(output) && - std::find(exclude_var_names.begin(), exclude_var_names.end(), - output) == exclude_var_names.end()) { - LOG(kNO_LOG) << "output: " << output; - ClVarNode *node = CreateNode(output); - analysis_nodes_.push(node); - } - } - } - for (const auto &inputs : op->GetInputs()) { - for (const auto &input : inputs.second) { - // not a persistable and not a exclude one ,then add it to - // analysis_nodes - if (!IsPersistable(input) && - std::find(exclude_var_names.begin(), exclude_var_names.end(), - input) == exclude_var_names.end()) { - LOG(kNO_LOG) << "input: " << input; - ClVarNode *node = CreateNode(input); - analysis_nodes_.push(node); - if (op->Type() == "fetch") { - fetch_var_nodes.push_back(node); - } - } - } - } - for (const auto &outputs : op->GetOutputs()) { - for (const auto &output : outputs.second) { - if (!IsPersistable(output) && - std::find(exclude_var_names.begin(), exclude_var_names.end(), - output) == exclude_var_names.end()) { - LOG(kNO_LOG) << "output: " << output; - ClVarNode *node = CreateNode(output); - analysis_nodes_.push(node); - } - } - } - } - - // apply optimize - while (!analysis_nodes_.empty()) { - auto *node = analysis_nodes_.top(); - analysis_nodes_.pop(); - // only not visited node can reuse memory between other nodes - // with 0 count which indicate they will not be used any more - if (!node->visited) { - bool reused = false; - // find out a possable reuse list - for (auto &list : reused_nodes_) { - // reference count = 0 and not in fetch list - if (list.back()->count == 0 && - std::find(fetch_var_nodes.begin(), fetch_var_nodes.end(), - list.back()) == fetch_var_nodes.end()) { - list.push_back(node); - reused = true; - break; - } - } - // create new list if can't find a reused list - if (!reused) { - std::vector list; - list.push_back(node); - reused_nodes_.push_back(std::move(list)); - } - } - 
node->visited = true; - node->count -= 1; - } - // shared data within all variables in the same reused list - ShareData(scope, memory_optimization_level, target_dims); - } -} - -void MemoryOptPassCl::ShareData( - framework::Scope *scope, MemoryOptimizationLevel memory_optimization_level, - framework::DDim target_dims) - const { // shared data within all variables in the same reused list - cl_context context = scope->GetCLScpoe()->Context(); - cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue(); - - for (const auto &list : reused_nodes_) { - LOG(kNO_LOG) << "\n"; - LOG(kNO_LOG) << "gpu . share memory within these variables"; - int64_t x_based_max_numl = -1; - int64_t y_based_max_numl = -1; - int64_t x_based_max_x = -1; - int64_t x_based_max_y = -1; - int64_t y_based_max_x = -1; - int64_t y_based_max_y = -1; - - framework::CLImage *x_based_reuse_tensor = nullptr; - framework::CLImage *y_based_reuse_tensor = nullptr; - for (const auto &node : list) { - auto *var = scope->Var(node->name); - auto *tensor = var->template GetMutable(); - const int64_t numl = tensor->numel(); - auto origin_tensor_dims = tensor->dims(); - - // for super ,hack origin dims - if (target_dims.size() == 4) { - PADDLE_MOBILE_ENFORCE(origin_tensor_dims.size() == 4, - "tensor dims must be equal to 4"); - origin_tensor_dims = {origin_tensor_dims[0], origin_tensor_dims[1], - target_dims[2], target_dims[3]}; - tensor->Resize(origin_tensor_dims); - } - - const framework::DDim &image_dims = - normal_converter->InitImageDimInfoWith(origin_tensor_dims); - int64_t image_dims_x = image_dims[0]; - int64_t image_dims_y = image_dims[1]; - // classify memory into two parts - if (image_dims_x > image_dims_y) { - // choose a biggest tensor for reuse - if (x_based_max_numl < numl) { - x_based_max_numl = numl; - x_based_reuse_tensor = tensor; - } - x_based_max_x = std::max(x_based_max_x, image_dims_x); - x_based_max_y = std::max(x_based_max_y, image_dims_y); - } else { - // choose a biggest tensor 
for reuse - if (y_based_max_numl < numl) { - y_based_max_numl = numl; - y_based_reuse_tensor = tensor; - } - y_based_max_x = std::max(y_based_max_x, image_dims_x); - y_based_max_y = std::max(y_based_max_y, image_dims_y); - } - } - - PADDLE_MOBILE_ENFORCE( - x_based_reuse_tensor != nullptr || y_based_reuse_tensor != nullptr, - "x_based_reuse_tensor and y_based_reuse_tensor can not be null at same " - "time"); - - // init x based shared cl mem - if (x_based_reuse_tensor != nullptr) { - const framework::DDim &x_reuse_dims = x_based_reuse_tensor->dims(); - x_based_reuse_tensor->InitFakeSizeImage( - context, command_queue, x_reuse_dims, {x_based_max_x, x_based_max_y}); - } - - // init y based shared cl mem - if (y_based_reuse_tensor != nullptr) { - const framework::DDim &y_reuse_dims = y_based_reuse_tensor->dims(); - y_based_reuse_tensor->InitFakeSizeImage( - context, command_queue, y_reuse_dims, {y_based_max_x, y_based_max_y}); - } - // share mem - for (const auto &node : list) { - auto *var = scope->Var(node->name); - auto *tensor = var->template GetMutable(); - auto need_dims = tensor->dims(); - - // for super ,hack origin dims - if (target_dims.size() == 4) { - need_dims = {need_dims[0], need_dims[1], target_dims[2], - target_dims[3]}; - } - - const framework::DDim &need_image_dims = - normal_converter->InitImageDimInfoWith(need_dims); - int64_t image_dims_x = need_image_dims[0]; - int64_t image_dims_y = need_image_dims[1]; - - if (image_dims_x > image_dims_y) { - PADDLE_MOBILE_ENFORCE(x_based_reuse_tensor != nullptr, - "x_based_reuse_tensor not null here"); - tensor->InitWithExistMem(context, command_queue, need_dims, - *x_based_reuse_tensor); - } else { - PADDLE_MOBILE_ENFORCE(y_based_reuse_tensor != nullptr, - "y_based_reuse_tensor not null here"); - tensor->InitWithExistMem(context, command_queue, need_dims, - *y_based_reuse_tensor); - } - } - } -} - -} // namespace pass -} // namespace paddle_mobile -#endif diff --git a/mobile/src/pass/memory_optimize_cl.h 
b/mobile/src/pass/memory_optimize_cl.h deleted file mode 100644 index aafdda4b34..0000000000 --- a/mobile/src/pass/memory_optimize_cl.h +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef PADDLE_MOBILE_CL - -#pragma once - -#include -#include -#include -#include -#include "framework/cl/cl_image_converter.h" -#include "framework/lod_tensor.h" -#include "framework/program/program.h" -#include "pass/pass_base.h" - -// use for opencl -namespace paddle_mobile { -namespace pass { - -typedef struct { - std::string name; // variable name - int count; // reference count - bool visited; -} ClVarNode; - -// MemoryOptPass will analyze the program, and reuse memory between -// variables as much as possible -class MemoryOptPassCl : public PassBase { - public: - MemoryOptPassCl() {} - virtual ~MemoryOptPassCl() { - for (auto &it : created_nodes_) { - delete it.second; - } - delete normal_converter; - } - - void operator()(const framework::ProgramDesc *program, - framework::Scope *scope, - MemoryOptimizationLevel memory_optimization_level, - framework::DDim dims = {}); - - void AppendBlockVars(const framework::BlockDesc *block); - - bool IsPersistable(const std::string name); - - ClVarNode *CreateNode(const std::string name); - - void ShareData(framework::Scope *scope, - MemoryOptimizationLevel memory_optimization_level, - framework::DDim dims) const; - - private: - std::stack analysis_nodes_; - 
std::vector> reused_nodes_; - std::unordered_map created_nodes_; - std::unordered_map block_vars_; - paddle_mobile::framework::CLImageConverterNormal *normal_converter = - new paddle_mobile::framework::CLImageConverterNormal(); -}; - -} // namespace pass -} // namespace paddle_mobile -#endif diff --git a/mobile/src/pass/model_obfuscate.cpp b/mobile/src/pass/model_obfuscate.cpp deleted file mode 100644 index 913b93af25..0000000000 --- a/mobile/src/pass/model_obfuscate.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "pass/model_obfuscate.h" - -namespace paddle_mobile { -namespace pass { - -ModelObfuscatePass::ModelObfuscatePass(std::string key) { - for (auto c : key) { - acc *= base; - acc += (int)c; - acc %= stride; - } - acc += stride; -} - -void ModelObfuscatePass::convert_data(char *data, int len) { - for (int i = 0; i < len; i += acc) { - data[i] = 255 - data[i]; - } -} - -} // namespace pass -} // namespace paddle_mobile diff --git a/mobile/src/pass/model_obfuscate.h b/mobile/src/pass/model_obfuscate.h deleted file mode 100644 index 6c2912e05a..0000000000 --- a/mobile/src/pass/model_obfuscate.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "pass/pass_base.h" - -namespace paddle_mobile { -namespace pass { - -class ModelObfuscatePass : public PassBase { - public: - ModelObfuscatePass(std::string key); - void convert_data(char *data, int len); - int version = 1; - - private: - int acc = 0; - int base = 17; - int stride = 100; -}; - -} // namespace pass -} // namespace paddle_mobile diff --git a/mobile/src/pass/pass_base.h b/mobile/src/pass/pass_base.h deleted file mode 100644 index 925fdb7d50..0000000000 --- a/mobile/src/pass/pass_base.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle_mobile { -namespace pass { - -class PassBase { - public: - PassBase() {} - virtual ~PassBase() {} -}; - -} // namespace pass -} // namespace paddle_mobile diff --git a/mobile/src/protobuf-c/protobuf-c.cpp b/mobile/src/protobuf-c/protobuf-c.cpp deleted file mode 100644 index 8e739df43c..0000000000 --- a/mobile/src/protobuf-c/protobuf-c.cpp +++ /dev/null @@ -1,2249 +0,0 @@ -/* - * Copyright (c) 2008-2015, Dave Benson and the protobuf-c authors. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/*! \file - * Support library for `protoc-c` generated code. - * - * This file implements the public API used by the code generated - * by `protoc-c`. 
- * - * \authors Dave Benson and the protobuf-c authors - * - * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license. - */ - -/** - * \todo 64-BIT OPTIMIZATION: certain implementations use 32-bit math - * even on 64-bit platforms (uint64_size, PaddleMobile__Framework__uint64_pack, - * PaddleMobile__Framework__parse_uint64). - * - * \todo Use size_t consistently. - */ - -#include /* for malloc, free */ -#include /* for strcmp, strlen, memcpy, memmove, memset */ - -#include "protobuf-c.h" - -#define TRUE 1 -#define FALSE 0 - -#define PROTOBUF_C__ASSERT_NOT_REACHED() assert(0) - -/* Workaround for Microsoft compilers. */ -#ifdef _MSC_VER -#define inline __inline -#endif - -/** - * \defgroup internal Internal functions and macros - * - * These are not exported by the library but are useful to developers working - * on `libprotobuf-c` itself. - */ - -/** - * \defgroup macros Utility macros for manipulating structures - * - * Macros and constants used to manipulate the base "classes" generated by - * `protobuf-c`. They also define limits and check correctness. - * - * \ingroup internal - * @{ - */ - -/** The maximum length of a 64-bit integer in varint encoding. */ -#define MAX_UINT64_ENCODED_SIZE 10 - -#ifndef PROTOBUF_C_UNPACK_ERROR -#define PROTOBUF_C_UNPACK_ERROR(...) -#endif - -const char PaddleMobile__Framework__protobuf_c_empty_string[] = ""; - -/** - * Internal `PaddleMobile__Framework__ProtobufCMessage` manipulation macro. - * - * Base macro for manipulating a `PaddleMobile__Framework__ProtobufCMessage`. - * Used by STRUCT_MEMBER() and STRUCT_MEMBER_PTR(). - */ -#define STRUCT_MEMBER_P(struct_p, struct_offset) \ - ((void *)((uint8_t *)(struct_p) + (struct_offset))) - -/** - * Return field in a `PaddleMobile__Framework__ProtobufCMessage` based on - * offset. - * - * Take a pointer to a `PaddleMobile__Framework__ProtobufCMessage` and find the - * field at the offset. Cast it to the passed type. 
- */ -#define STRUCT_MEMBER(member_type, struct_p, struct_offset) \ - (*(member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) - -/** - * Return field in a `PaddleMobile__Framework__ProtobufCMessage` based on - * offset. - * - * Take a pointer to a `PaddleMobile__Framework__ProtobufCMessage` and find the - * field at the offset. Cast it to a pointer to the passed type. - */ -#define STRUCT_MEMBER_PTR(member_type, struct_p, struct_offset) \ - ((member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) - -/* Assertions for magic numbers. */ - -#define ASSERT_IS_ENUM_DESCRIPTOR(desc) \ - assert((desc)->magic == PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC) - -#define ASSERT_IS_MESSAGE_DESCRIPTOR(desc) \ - assert((desc)->magic == PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) - -#define ASSERT_IS_MESSAGE(message) \ - ASSERT_IS_MESSAGE_DESCRIPTOR((message)->descriptor) - -#define ASSERT_IS_SERVICE_DESCRIPTOR(desc) \ - assert((desc)->magic == PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC) - -/**@}*/ - -/* --- version --- */ - -const char *PaddleMobile__Framework__protobuf_c_version(void) { - return PROTOBUF_C_VERSION; -} - -uint32_t PaddleMobile__Framework__protobuf_c_version_number(void) { - return PROTOBUF_C_VERSION_NUMBER; -} - -/* --- allocator --- */ - -static void *PaddleMobile__Framework__system_alloc(void *allocator_data, - size_t size) { - return malloc(size); -} - -static void PaddleMobile__Framework__system_free(void *allocator_data, - void *data) { - free(data); -} - -static inline void *PaddleMobile__Framework__do_alloc( - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t size) { - return allocator->alloc(allocator->allocator_data, size); -} - -static inline void PaddleMobile__Framework__do_free( - PaddleMobile__Framework__ProtobufCAllocator *allocator, void *data) { - if (data != NULL) allocator->free(allocator->allocator_data, data); -} - -/* - * This allocator uses the system's malloc() and free(). 
It is the default - * allocator used if NULL is passed as the - * PaddleMobile__Framework__ProtobufCAllocator to an exported function. - */ -static PaddleMobile__Framework__ProtobufCAllocator protobuf_c__allocator = { - .alloc = &PaddleMobile__Framework__system_alloc, - .free = &PaddleMobile__Framework__system_free, - .allocator_data = NULL, -}; - -/* === buffer-simple === */ - -void PaddleMobile__Framework__protobuf_c_buffer_simple_append( - PaddleMobile__Framework__ProtobufCBuffer *buffer, size_t len, - const uint8_t *data) { - PaddleMobile__Framework__ProtobufCBufferSimple *simp = - (PaddleMobile__Framework__ProtobufCBufferSimple *)buffer; - size_t new_len = simp->len + len; - - if (new_len > simp->alloced) { - PaddleMobile__Framework__ProtobufCAllocator *allocator = simp->allocator; - size_t new_alloced = simp->alloced * 2; - uint8_t *new_data; - - if (allocator == NULL) allocator = &protobuf_c__allocator; - while (new_alloced < new_len) new_alloced += new_alloced; - new_data = - (uint8_t *)PaddleMobile__Framework__do_alloc(allocator, new_alloced); - if (!new_data) return; - memcpy(new_data, simp->data, simp->len); - if (simp->must_free_data) - PaddleMobile__Framework__do_free(allocator, simp->data); - else - simp->must_free_data = TRUE; - simp->data = new_data; - simp->alloced = new_alloced; - } - memcpy(simp->data + simp->len, data, len); - simp->len = new_len; -} - -/** - * \defgroup packedsz - * PaddleMobile__Framework__protobuf_c_message_get_packed_size() implementation - * - * Routines mainly used by - * PaddleMobile__Framework__protobuf_c_message_get_packed_size(). - * - * \ingroup internal - * @{ - */ - -/** - * Return the number of bytes required to store the tag for the field. Includes - * 3 bits for the wire-type, and a single bit that denotes the end-of-tag. - * - * \param number - * Field tag to encode. - * \return - * Number of bytes required. 
- */ -static inline size_t get_tag_size(uint32_t number) { - if (number < (1UL << 4)) { - return 1; - } else if (number < (1UL << 11)) { - return 2; - } else if (number < (1UL << 18)) { - return 3; - } else if (number < (1UL << 25)) { - return 4; - } else { - return 5; - } -} - -/** - * Return the number of bytes required to store a variable-length unsigned - * 32-bit integer in base-128 varint encoding. - * - * \param v - * Value to encode. - * \return - * Number of bytes required. - */ -static inline size_t uint32_size(uint32_t v) { - if (v < (1UL << 7)) { - return 1; - } else if (v < (1UL << 14)) { - return 2; - } else if (v < (1UL << 21)) { - return 3; - } else if (v < (1UL << 28)) { - return 4; - } else { - return 5; - } -} - -/** - * Return the number of bytes required to store a variable-length signed 32-bit - * integer in base-128 varint encoding. - * - * \param v - * Value to encode. - * \return - * Number of bytes required. - */ -static inline size_t int32_size(int32_t v) { - if (v < 0) { - return 10; - } else if (v < (1L << 7)) { - return 1; - } else if (v < (1L << 14)) { - return 2; - } else if (v < (1L << 21)) { - return 3; - } else if (v < (1L << 28)) { - return 4; - } else { - return 5; - } -} - -/** - * Return the ZigZag-encoded 32-bit unsigned integer form of a 32-bit signed - * integer. - * - * \param v - * Value to encode. - * \return - * ZigZag encoded integer. - */ -static inline uint32_t zigzag32(int32_t v) { - if (v < 0) - return (-(uint32_t)v) * 2 - 1; - else - return (uint32_t)(v)*2; -} - -/** - * Return the number of bytes required to store a signed 32-bit integer, - * converted to an unsigned 32-bit integer with ZigZag encoding, using base-128 - * varint encoding. - * - * \param v - * Value to encode. - * \return - * Number of bytes required. 
- */ -static inline size_t sint32_size(int32_t v) { return uint32_size(zigzag32(v)); } - -/** - * Return the number of bytes required to store a 64-bit unsigned integer in - * base-128 varint encoding. - * - * \param v - * Value to encode. - * \return - * Number of bytes required. - */ -static inline size_t uint64_size(uint64_t v) { - uint32_t upper_v = (uint32_t)(v >> 32); - - if (upper_v == 0) { - return uint32_size((uint32_t)v); - } else if (upper_v < (1UL << 3)) { - return 5; - } else if (upper_v < (1UL << 10)) { - return 6; - } else if (upper_v < (1UL << 17)) { - return 7; - } else if (upper_v < (1UL << 24)) { - return 8; - } else if (upper_v < (1UL << 31)) { - return 9; - } else { - return 10; - } -} - -/** - * Return the ZigZag-encoded 64-bit unsigned integer form of a 64-bit signed - * integer. - * - * \param v - * Value to encode. - * \return - * ZigZag encoded integer. - */ -static inline uint64_t zigzag64(int64_t v) { - if (v < 0) - return (-(uint64_t)v) * 2 - 1; - else - return (uint64_t)(v)*2; -} - -/** - * Return the number of bytes required to store a signed 64-bit integer, - * converted to an unsigned 64-bit integer with ZigZag encoding, using base-128 - * varint encoding. - * - * \param v - * Value to encode. - * \return - * Number of bytes required. - */ -static inline size_t sint64_size(int64_t v) { return uint64_size(zigzag64(v)); } - -/** - * Calculate the serialized size of a single required message field, including - * the space needed by the preceding tag. - * - * \param field - * Field descriptor for member. - * \param member - * Field to encode. - * \return - * Number of bytes required. 
- */ -static size_t PaddleMobile__Framework__required_field_get_packed_size( - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field, - const void *member) { - size_t rv = get_tag_size(field->id); - - switch (field->type) { - case PROTOBUF_C_TYPE_SINT32: - return rv + sint32_size(*(const int32_t *)member); - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: - return rv + int32_size(*(const int32_t *)member); - case PROTOBUF_C_TYPE_UINT32: - return rv + uint32_size(*(const uint32_t *)member); - case PROTOBUF_C_TYPE_SINT64: - return rv + sint64_size(*(const int64_t *)member); - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - return rv + uint64_size(*(const uint64_t *)member); - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - return rv + 4; - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - return rv + 8; - case PROTOBUF_C_TYPE_BOOL: - return rv + 1; - case PROTOBUF_C_TYPE_FLOAT: - return rv + 4; - case PROTOBUF_C_TYPE_DOUBLE: - return rv + 8; - case PROTOBUF_C_TYPE_STRING: { - const char *str = *(char *const *)member; - size_t len = str ? strlen(str) : 0; - return rv + uint32_size(len) + len; - } - case PROTOBUF_C_TYPE_BYTES: { - size_t len = - ((const PaddleMobile__Framework__ProtobufCBinaryData *)member)->len; - return rv + uint32_size(len) + len; - } - case PROTOBUF_C_TYPE_MESSAGE: { - const PaddleMobile__Framework__ProtobufCMessage *msg = - *(PaddleMobile__Framework__ProtobufCMessage *const *)member; - size_t subrv = - msg ? PaddleMobile__Framework__protobuf_c_message_get_packed_size(msg) - : 0; - return rv + uint32_size(subrv) + subrv; - } - } - PROTOBUF_C__ASSERT_NOT_REACHED(); - return 0; -} - -/** - * Calculate the serialized size of a single oneof message field, including - * the space needed by the preceding tag. Returns 0 if the oneof field isn't - * selected or is not set. - * - * \param field - * Field descriptor for member. 
- * \param oneof_case - * Enum value that selects the field in the oneof. - * \param member - * Field to encode. - * \return - * Number of bytes required. - */ -static size_t PaddleMobile__Framework__oneof_field_get_packed_size( - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field, - uint32_t oneof_case, const void *member) { - if (oneof_case != field->id) { - return 0; - } - if (field->type == PROTOBUF_C_TYPE_MESSAGE || - field->type == PROTOBUF_C_TYPE_STRING) { - const void *ptr = *(const void *const *)member; - if (ptr == NULL || ptr == field->default_value) return 0; - } - return PaddleMobile__Framework__required_field_get_packed_size(field, member); -} - -/** - * Calculate the serialized size of a single optional message field, including - * the space needed by the preceding tag. Returns 0 if the optional field isn't - * set. - * - * \param field - * Field descriptor for member. - * \param has - * True if the field exists, false if not. - * \param member - * Field to encode. - * \return - * Number of bytes required. 
- */ -static size_t PaddleMobile__Framework__optional_field_get_packed_size( - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field, - const protobuf_c_boolean has, const void *member) { - if (field->type == PROTOBUF_C_TYPE_MESSAGE || - field->type == PROTOBUF_C_TYPE_STRING) { - const void *ptr = *(const void *const *)member; - if (ptr == NULL || ptr == field->default_value) return 0; - } else { - if (!has) return 0; - } - return PaddleMobile__Framework__required_field_get_packed_size(field, member); -} - -static protobuf_c_boolean PaddleMobile__Framework__field_is_zeroish( - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field, - const void *member) { - protobuf_c_boolean ret = FALSE; - - switch (field->type) { - case PROTOBUF_C_TYPE_BOOL: - ret = (0 == *(const protobuf_c_boolean *)member); - break; - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_SINT32: - case PROTOBUF_C_TYPE_INT32: - case PROTOBUF_C_TYPE_UINT32: - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - ret = (0 == *(const uint32_t *)member); - break; - case PROTOBUF_C_TYPE_SINT64: - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - ret = (0 == *(const uint64_t *)member); - break; - case PROTOBUF_C_TYPE_FLOAT: - ret = (0 == *(const float *)member); - break; - case PROTOBUF_C_TYPE_DOUBLE: - ret = (0 == *(const double *)member); - break; - case PROTOBUF_C_TYPE_STRING: - ret = (NULL == *(const char *const *)member) || - ('\0' == **(const char *const *)member); - break; - case PROTOBUF_C_TYPE_BYTES: - case PROTOBUF_C_TYPE_MESSAGE: - ret = (NULL == *(const void *const *)member); - break; - default: - ret = TRUE; - break; - } - - return ret; -} - -/** - * Calculate the serialized size of a single unlabeled message field, including - * the space needed by the preceding tag. Returns 0 if the field isn't set or - * if it is set to a "zeroish" value (null pointer or 0 for numerical values). 
- * Unlabeled fields are supported only in proto3. - * - * \param field - * Field descriptor for member. - * \param member - * Field to encode. - * \return - * Number of bytes required. - */ -static size_t PaddleMobile__Framework__unlabeled_field_get_packed_size( - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field, - const void *member) { - if (PaddleMobile__Framework__field_is_zeroish(field, member)) return 0; - return PaddleMobile__Framework__required_field_get_packed_size(field, member); -} - -/** - * Calculate the serialized size of repeated message fields, which may consist - * of any number of values (including 0). Includes the space needed by the - * preceding tags (as needed). - * - * \param field - * Field descriptor for member. - * \param count - * Number of repeated field members. - * \param member - * Field to encode. - * \return - * Number of bytes required. - */ -static size_t PaddleMobile__Framework__repeated_field_get_packed_size( - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field, - size_t count, const void *member) { - size_t header_size; - size_t rv = 0; - unsigned i; - void *array = *(void *const *)member; - - if (count == 0) return 0; - header_size = get_tag_size(field->id); - if (0 == (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) header_size *= count; - - switch (field->type) { - case PROTOBUF_C_TYPE_SINT32: - for (i = 0; i < count; i++) rv += sint32_size(((int32_t *)array)[i]); - break; - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: - for (i = 0; i < count; i++) rv += int32_size(((int32_t *)array)[i]); - break; - case PROTOBUF_C_TYPE_UINT32: - for (i = 0; i < count; i++) rv += uint32_size(((uint32_t *)array)[i]); - break; - case PROTOBUF_C_TYPE_SINT64: - for (i = 0; i < count; i++) rv += sint64_size(((int64_t *)array)[i]); - break; - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - for (i = 0; i < count; i++) rv += uint64_size(((uint64_t *)array)[i]); - break; - case 
PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - rv += 4 * count; - break; - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - rv += 8 * count; - break; - case PROTOBUF_C_TYPE_BOOL: - rv += count; - break; - case PROTOBUF_C_TYPE_STRING: - for (i = 0; i < count; i++) { - size_t len = strlen(((char **)array)[i]); - rv += uint32_size(len) + len; - } - break; - case PROTOBUF_C_TYPE_BYTES: - for (i = 0; i < count; i++) { - size_t len = - ((PaddleMobile__Framework__ProtobufCBinaryData *)array)[i].len; - rv += uint32_size(len) + len; - } - break; - case PROTOBUF_C_TYPE_MESSAGE: - for (i = 0; i < count; i++) { - size_t len = - PaddleMobile__Framework__protobuf_c_message_get_packed_size( - ((PaddleMobile__Framework__ProtobufCMessage **)array)[i]); - rv += uint32_size(len) + len; - } - break; - } - - if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) - header_size += uint32_size(rv); - return header_size + rv; -} - -/** - * Calculate the serialized size of an unknown field, i.e. one that is passed - * through mostly uninterpreted. This is required for forward compatibility if - * new fields are added to the message descriptor. - * - * \param field - * Unknown field type. - * \return - * Number of bytes required. - */ -static inline size_t PaddleMobile__Framework__unknown_field_get_packed_size( - const PaddleMobile__Framework__ProtobufCMessageUnknownField *field) { - return get_tag_size(field->tag) + field->len; -} - -/**@}*/ - -/* - * Calculate the serialized size of the message. 
- */ -size_t PaddleMobile__Framework__protobuf_c_message_get_packed_size( - const PaddleMobile__Framework__ProtobufCMessage *message) { - unsigned i; - size_t rv = 0; - - ASSERT_IS_MESSAGE(message); - for (i = 0; i < message->descriptor->n_fields; i++) { - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field = - message->descriptor->fields + i; - const void *member = ((const char *)message) + field->offset; - const void *qmember = ((const char *)message) + field->quantifier_offset; - - if (field->label == PROTOBUF_C_LABEL_REQUIRED) { - rv += PaddleMobile__Framework__required_field_get_packed_size(field, - member); - } else if ((field->label == PROTOBUF_C_LABEL_OPTIONAL || - field->label == PROTOBUF_C_LABEL_NONE) && - (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF))) { - rv += PaddleMobile__Framework__oneof_field_get_packed_size( - field, *(const uint32_t *)qmember, member); - } else if (field->label == PROTOBUF_C_LABEL_OPTIONAL) { - rv += PaddleMobile__Framework__optional_field_get_packed_size( - field, *(protobuf_c_boolean *)qmember, member); - } else if (field->label == PROTOBUF_C_LABEL_NONE) { - rv += PaddleMobile__Framework__unlabeled_field_get_packed_size(field, - member); - } else { - rv += PaddleMobile__Framework__repeated_field_get_packed_size( - field, *(const size_t *)qmember, member); - } - } - for (i = 0; i < message->n_unknown_fields; i++) - rv += PaddleMobile__Framework__unknown_field_get_packed_size( - &message->unknown_fields[i]); - return rv; -} - -/** - * \defgroup pack protobuf_c_message_pack() implementation - * - * Routines mainly used by protobuf_c_message_pack(). - * - * \ingroup internal - * @{ - */ - -/** - * Pack an unsigned 32-bit integer in base-128 varint encoding and return the - * number of bytes written, which must be 5 or less. - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. 
- */ -static inline size_t PaddleMobile__Framework__uint32_pack(uint32_t value, - uint8_t *out) { - unsigned rv = 0; - - if (value >= 0x80) { - out[rv++] = value | 0x80; - value >>= 7; - if (value >= 0x80) { - out[rv++] = value | 0x80; - value >>= 7; - if (value >= 0x80) { - out[rv++] = value | 0x80; - value >>= 7; - if (value >= 0x80) { - out[rv++] = value | 0x80; - value >>= 7; - } - } - } - } - /* assert: value<128 */ - out[rv++] = value; - return rv; -} - -/** - * Pack a 64-bit unsigned integer using base-128 varint encoding and return the - * number of bytes written. - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static size_t PaddleMobile__Framework__uint64_pack(uint64_t value, - uint8_t *out) { - uint32_t hi = (uint32_t)(value >> 32); - uint32_t lo = (uint32_t)value; - unsigned rv; - - if (hi == 0) return PaddleMobile__Framework__uint32_pack((uint32_t)lo, out); - out[0] = (lo) | 0x80; - out[1] = (lo >> 7) | 0x80; - out[2] = (lo >> 14) | 0x80; - out[3] = (lo >> 21) | 0x80; - if (hi < 8) { - out[4] = (hi << 4) | (lo >> 28); - return 5; - } else { - out[4] = ((hi & 7) << 4) | (lo >> 28) | 0x80; - hi >>= 3; - } - rv = 5; - while (hi >= 128) { - out[rv++] = hi | 0x80; - hi >>= 7; - } - out[rv++] = hi; - return rv; -} - -/** - * Pack a PaddleMobile__Framework__ProtobufCBinaryData and return the number of - * bytes written. The output includes a length delimiter. - * - * \param bd - * PaddleMobile__Framework__ProtobufCBinaryData to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t PaddleMobile__Framework__binary_data_pack( - const PaddleMobile__Framework__ProtobufCBinaryData *bd, uint8_t *out) { - size_t len = bd->len; - size_t rv = PaddleMobile__Framework__uint32_pack(len, out); - memcpy(out + rv, bd->data, len); - return rv + len; -} - -/** - * Pack a field tag. 
- * - * Wire-type will be added in required_field_pack(). - * - * \todo Just call PaddleMobile__Framework__uint64_pack on 64-bit platforms. - * - * \param id - * Tag value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static size_t PaddleMobile__Framework__tag_pack(uint32_t id, uint8_t *out) { - if (id < (1UL << (32 - 3))) - return PaddleMobile__Framework__uint32_pack(id << 3, out); - else - return PaddleMobile__Framework__uint64_pack(((uint64_t)id) << 3, out); -} - -/** - * Given a field type, return the in-memory size. - * - * \todo Implement as a table lookup. - * - * \param type - * Field type. - * \return - * Size of the field. - */ -static inline size_t PaddleMobile__Framework__sizeof_elt_in_repeated_array( - PaddleMobile__Framework__ProtobufCType type) { - switch (type) { - case PROTOBUF_C_TYPE_SINT32: - case PROTOBUF_C_TYPE_INT32: - case PROTOBUF_C_TYPE_UINT32: - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - case PROTOBUF_C_TYPE_ENUM: - return 4; - case PROTOBUF_C_TYPE_SINT64: - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - return 8; - case PROTOBUF_C_TYPE_BOOL: - return sizeof(protobuf_c_boolean); - case PROTOBUF_C_TYPE_STRING: - case PROTOBUF_C_TYPE_MESSAGE: - return sizeof(void *); - case PROTOBUF_C_TYPE_BYTES: - return sizeof(PaddleMobile__Framework__ProtobufCBinaryData); - } - PROTOBUF_C__ASSERT_NOT_REACHED(); - return 0; -} - -static inline int PaddleMobile__Framework__int_range_lookup( - unsigned n_ranges, const PaddleMobile__Framework__ProtobufCIntRange *ranges, - int value) { - unsigned n; - unsigned start; - - if (n_ranges == 0) return -1; - start = 0; - n = n_ranges; - while (n > 1) { - unsigned mid = start + n / 2; - - if (value < ranges[mid].start_value) { - n = mid - start; - } else if (value >= - ranges[mid].start_value + - 
(int)(ranges[mid + 1].orig_index - ranges[mid].orig_index)) { - unsigned new_start = mid + 1; - n = start + n - new_start; - start = new_start; - } else - return (value - ranges[mid].start_value) + ranges[mid].orig_index; - } - if (n > 0) { - unsigned start_orig_index = ranges[start].orig_index; - unsigned range_size = ranges[start + 1].orig_index - start_orig_index; - - if (ranges[start].start_value <= value && - value < (int)(ranges[start].start_value + range_size)) { - return (value - ranges[start].start_value) + start_orig_index; - } - } - return -1; -} - -static size_t PaddleMobile__Framework__parse_tag_and_wiretype( - size_t len, const uint8_t *data, uint32_t *tag_out, - PaddleMobile__Framework__ProtobufCWireType *wiretype_out) { - unsigned max_rv = len > 5 ? 5 : len; - uint32_t tag = (data[0] & 0x7f) >> 3; - unsigned shift = 4; - unsigned rv; - - *wiretype_out = (PaddleMobile__Framework__ProtobufCWireType)(data[0] & 7); - if ((data[0] & 0x80) == 0) { - *tag_out = tag; - return 1; - } - for (rv = 1; rv < max_rv; rv++) { - if (data[rv] & 0x80) { - tag |= (data[rv] & 0x7f) << shift; - shift += 7; - } else { - tag |= data[rv] << shift; - *tag_out = tag; - return rv + 1; - } - } - return 0; /* error: bad header */ -} - -/* sizeof(ScannedMember) must be <= (1UL< len) { - PROTOBUF_C_UNPACK_ERROR("data too short after length-prefix of %u", val); - return 0; - } - return hdr_len + val; -} - -static size_t PaddleMobile__Framework__max_b128_numbers(size_t len, - const uint8_t *data) { - size_t rv = 0; - while (len--) - if ((*data++ & 0x80) == 0) ++rv; - return rv; -} - -/**@}*/ - -/** - * Merge earlier message into a latter message. - * - * For numeric types and strings, if the same value appears multiple - * times, the parser accepts the last value it sees. For embedded - * message fields, the parser merges multiple instances of the same - * field. 
That is, all singular scalar fields in the latter instance - * replace those in the former, singular embedded messages are merged, - * and repeated fields are concatenated. - * - * The earlier message should be freed after calling this function, as - * some of its fields may have been reused and changed to their default - * values during the merge. - */ -static protobuf_c_boolean PaddleMobile__Framework__merge_messages( - PaddleMobile__Framework__ProtobufCMessage *earlier_msg, - PaddleMobile__Framework__ProtobufCMessage *latter_msg, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - unsigned i; - const PaddleMobile__Framework__ProtobufCFieldDescriptor *fields = - latter_msg->descriptor->fields; - for (i = 0; i < latter_msg->descriptor->n_fields; i++) { - if (fields[i].label == PROTOBUF_C_LABEL_REPEATED) { - size_t *n_earlier = - STRUCT_MEMBER_PTR(size_t, earlier_msg, fields[i].quantifier_offset); - uint8_t **p_earlier = - STRUCT_MEMBER_PTR(uint8_t *, earlier_msg, fields[i].offset); - size_t *n_latter = - STRUCT_MEMBER_PTR(size_t, latter_msg, fields[i].quantifier_offset); - uint8_t **p_latter = - STRUCT_MEMBER_PTR(uint8_t *, latter_msg, fields[i].offset); - - if (*n_earlier > 0) { - if (*n_latter > 0) { - /* Concatenate the repeated field */ - size_t el_size = - PaddleMobile__Framework__sizeof_elt_in_repeated_array( - fields[i].type); - uint8_t *new_field; - - new_field = (uint8_t *)PaddleMobile__Framework__do_alloc( - allocator, (*n_earlier + *n_latter) * el_size); - if (!new_field) return FALSE; - - memcpy(new_field, *p_earlier, *n_earlier * el_size); - memcpy(new_field + *n_earlier * el_size, *p_latter, - *n_latter * el_size); - - PaddleMobile__Framework__do_free(allocator, *p_latter); - PaddleMobile__Framework__do_free(allocator, *p_earlier); - *p_latter = new_field; - *n_latter = *n_earlier + *n_latter; - } else { - /* Zero copy the repeated field from the earlier message */ - *n_latter = *n_earlier; - *p_latter = *p_earlier; - } - /* Make sure the 
field does not get double freed */ - *n_earlier = 0; - *p_earlier = 0; - } - } else if (fields[i].label == PROTOBUF_C_LABEL_OPTIONAL || - fields[i].label == PROTOBUF_C_LABEL_NONE) { - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field; - uint32_t *earlier_case_p = - STRUCT_MEMBER_PTR(uint32_t, earlier_msg, fields[i].quantifier_offset); - uint32_t *latter_case_p = - STRUCT_MEMBER_PTR(uint32_t, latter_msg, fields[i].quantifier_offset); - protobuf_c_boolean need_to_merge = FALSE; - void *earlier_elem; - void *latter_elem; - const void *def_val; - - if (fields[i].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) { - if (*latter_case_p == 0) { - /* lookup correct oneof field */ - int field_index = PaddleMobile__Framework__int_range_lookup( - latter_msg->descriptor->n_field_ranges, - latter_msg->descriptor->field_ranges, *earlier_case_p); - field = latter_msg->descriptor->fields + field_index; - } else { - /* Oneof is present in the latter message, move on */ - continue; - } - } else { - field = &fields[i]; - } - - earlier_elem = STRUCT_MEMBER_P(earlier_msg, field->offset); - latter_elem = STRUCT_MEMBER_P(latter_msg, field->offset); - def_val = field->default_value; - - switch (field->type) { - case PROTOBUF_C_TYPE_MESSAGE: { - PaddleMobile__Framework__ProtobufCMessage *em = - *(PaddleMobile__Framework__ProtobufCMessage **)earlier_elem; - PaddleMobile__Framework__ProtobufCMessage *lm = - *(PaddleMobile__Framework__ProtobufCMessage **)latter_elem; - if (em != NULL) { - if (lm != NULL) { - if (!PaddleMobile__Framework__merge_messages(em, lm, allocator)) - return FALSE; - /* Already merged */ - need_to_merge = FALSE; - } else { - /* Zero copy the message */ - need_to_merge = TRUE; - } - } - break; - } - case PROTOBUF_C_TYPE_BYTES: { - uint8_t *e_data = - ((PaddleMobile__Framework__ProtobufCBinaryData *)earlier_elem) - ->data; - uint8_t *l_data = - ((PaddleMobile__Framework__ProtobufCBinaryData *)latter_elem) - ->data; - const PaddleMobile__Framework__ProtobufCBinaryData 
*d_bd = - (PaddleMobile__Framework__ProtobufCBinaryData *)def_val; - - need_to_merge = - (e_data != NULL && (d_bd == NULL || e_data != d_bd->data)) && - (l_data == NULL || (d_bd != NULL && l_data == d_bd->data)); - break; - } - case PROTOBUF_C_TYPE_STRING: { - char *e_str = *(char **)earlier_elem; - char *l_str = *(char **)latter_elem; - const char *d_str = (const char *)def_val; - - need_to_merge = e_str != d_str && l_str == d_str; - break; - } - default: { - /* Could be has field or case enum, the logic is - * equivalent, since 0 (FALSE) means not set for - * oneof */ - need_to_merge = (*earlier_case_p != 0) && (*latter_case_p == 0); - break; - } - } - - if (need_to_merge) { - size_t el_size = - PaddleMobile__Framework__sizeof_elt_in_repeated_array(field->type); - memcpy(latter_elem, earlier_elem, el_size); - /* - * Reset the element from the old message to 0 - * to make sure earlier message deallocation - * doesn't corrupt zero-copied data in the new - * message, earlier message will be freed after - * this function is called anyway - */ - memset(earlier_elem, 0, el_size); - - if (field->quantifier_offset != 0) { - /* Set the has field or the case enum, - * if applicable */ - *latter_case_p = *earlier_case_p; - *earlier_case_p = 0; - } - } - } - } - return TRUE; -} - -/** - * Count packed elements. - * - * Given a raw slab of packed-repeated values, determine the number of - * elements. This function detects certain kinds of errors but not - * others; the remaining error checking is done by - * PaddleMobile__Framework__parse_packed_repeated_member(). 
- */ -static protobuf_c_boolean PaddleMobile__Framework__count_packed_elements( - PaddleMobile__Framework__ProtobufCType type, size_t len, - const uint8_t *data, size_t *count_out) { - switch (type) { - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - if (len % 4 != 0) { - PROTOBUF_C_UNPACK_ERROR( - "length must be a multiple of 4 for fixed-length 32-bit types"); - return FALSE; - } - *count_out = len / 4; - return TRUE; - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - if (len % 8 != 0) { - PROTOBUF_C_UNPACK_ERROR( - "length must be a multiple of 8 for fixed-length 64-bit types"); - return FALSE; - } - *count_out = len / 8; - return TRUE; - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: - case PROTOBUF_C_TYPE_SINT32: - case PROTOBUF_C_TYPE_UINT32: - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_SINT64: - case PROTOBUF_C_TYPE_UINT64: - *count_out = PaddleMobile__Framework__max_b128_numbers(len, data); - return TRUE; - case PROTOBUF_C_TYPE_BOOL: - *count_out = len; - return TRUE; - case PROTOBUF_C_TYPE_STRING: - case PROTOBUF_C_TYPE_BYTES: - case PROTOBUF_C_TYPE_MESSAGE: - default: - PROTOBUF_C_UNPACK_ERROR("bad protobuf-c type %u for packed-repeated", - type); - return FALSE; - } -} - -static inline uint32_t PaddleMobile__Framework__parse_uint32( - unsigned len, const uint8_t *data) { - uint32_t rv = data[0] & 0x7f; - if (len > 1) { - rv |= ((uint32_t)(data[1] & 0x7f) << 7); - if (len > 2) { - rv |= ((uint32_t)(data[2] & 0x7f) << 14); - if (len > 3) { - rv |= ((uint32_t)(data[3] & 0x7f) << 21); - if (len > 4) rv |= ((uint32_t)(data[4]) << 28); - } - } - } - return rv; -} - -static inline uint32_t PaddleMobile__Framework__parse_int32( - unsigned len, const uint8_t *data) { - return PaddleMobile__Framework__parse_uint32(len, data); -} - -static inline int32_t unzigzag32(uint32_t v) { - if (v & 1) - return -(v >> 1) - 1; - else - return v >> 1; -} - -static 
inline uint32_t PaddleMobile__Framework__parse_fixed_uint32( - const uint8_t *data) { -#if !defined(WORDS_BIGENDIAN) - uint32_t t; - memcpy(&t, data, 4); - return t; -#else - return data[0] | ((uint32_t)(data[1]) << 8) | ((uint32_t)(data[2]) << 16) | - ((uint32_t)(data[3]) << 24); -#endif -} - -static uint64_t PaddleMobile__Framework__parse_uint64(unsigned len, - const uint8_t *data) { - unsigned shift, i; - uint64_t rv; - - if (len < 5) return PaddleMobile__Framework__parse_uint32(len, data); - rv = ((uint64_t)(data[0] & 0x7f)) | ((uint64_t)(data[1] & 0x7f) << 7) | - ((uint64_t)(data[2] & 0x7f) << 14) | ((uint64_t)(data[3] & 0x7f) << 21); - shift = 28; - for (i = 4; i < len; i++) { - rv |= (((uint64_t)(data[i] & 0x7f)) << shift); - shift += 7; - } - return rv; -} - -static inline int64_t PaddleMobile__Framework__unzigzag64(uint64_t v) { - if (v & 1) - return -(v >> 1) - 1; - else - return v >> 1; -} - -static inline uint64_t PaddleMobile__Framework__parse_fixed_uint64( - const uint8_t *data) { -#if !defined(WORDS_BIGENDIAN) - uint64_t t; - memcpy(&t, data, 8); - return t; -#else - return (uint64_t)PaddleMobile__Framework__parse_fixed_uint32(data) | - (((uint64_t)PaddleMobile__Framework__parse_fixed_uint32(data + 4)) - << 32); -#endif -} - -static protobuf_c_boolean PaddleMobile__Framework__parse_boolean( - unsigned len, const uint8_t *data) { - unsigned i; - for (i = 0; i < len; i++) - if (data[i] & 0x7f) return TRUE; - return FALSE; -} - -static protobuf_c_boolean PaddleMobile__Framework__parse_required_member( - ScannedMember *scanned_member, void *member, - PaddleMobile__Framework__ProtobufCAllocator *allocator, - protobuf_c_boolean maybe_clear) { - unsigned len = scanned_member->len; - const uint8_t *data = scanned_member->data; - PaddleMobile__Framework__ProtobufCWireType wire_type = - (PaddleMobile__Framework__ProtobufCWireType)scanned_member->wire_type; - - switch (scanned_member->field->type) { - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: - 
if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; - *(int32_t *)member = PaddleMobile__Framework__parse_int32(len, data); - return TRUE; - case PROTOBUF_C_TYPE_UINT32: - if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; - *(uint32_t *)member = PaddleMobile__Framework__parse_uint32(len, data); - return TRUE; - case PROTOBUF_C_TYPE_SINT32: - if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; - *(int32_t *)member = - unzigzag32(PaddleMobile__Framework__parse_uint32(len, data)); - return TRUE; - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - if (wire_type != PROTOBUF_C_WIRE_TYPE_32BIT) return FALSE; - *(uint32_t *)member = PaddleMobile__Framework__parse_fixed_uint32(data); - return TRUE; - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; - *(uint64_t *)member = PaddleMobile__Framework__parse_uint64(len, data); - return TRUE; - case PROTOBUF_C_TYPE_SINT64: - if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; - *(int64_t *)member = PaddleMobile__Framework__unzigzag64( - PaddleMobile__Framework__parse_uint64(len, data)); - return TRUE; - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - if (wire_type != PROTOBUF_C_WIRE_TYPE_64BIT) return FALSE; - *(uint64_t *)member = PaddleMobile__Framework__parse_fixed_uint64(data); - return TRUE; - case PROTOBUF_C_TYPE_BOOL: - *(protobuf_c_boolean *)member = - PaddleMobile__Framework__parse_boolean(len, data); - return TRUE; - case PROTOBUF_C_TYPE_STRING: { - char **pstr = (char **)member; - unsigned pref_len = scanned_member->length_prefix_len; - - if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; - - if (maybe_clear && *pstr != NULL) { - const char *def = (const char *)scanned_member->field->default_value; - if (*pstr != NULL && *pstr != def) - PaddleMobile__Framework__do_free(allocator, *pstr); - } - *pstr = (char 
*)PaddleMobile__Framework__do_alloc(allocator, - len - pref_len + 1); - if (*pstr == NULL) return FALSE; - memcpy(*pstr, data + pref_len, len - pref_len); - (*pstr)[len - pref_len] = 0; - return TRUE; - } - case PROTOBUF_C_TYPE_BYTES: { - PaddleMobile__Framework__ProtobufCBinaryData *bd = - (PaddleMobile__Framework__ProtobufCBinaryData *)member; - const PaddleMobile__Framework__ProtobufCBinaryData *def_bd; - unsigned pref_len = scanned_member->length_prefix_len; - - if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; - - def_bd = (const PaddleMobile__Framework__ProtobufCBinaryData *) - scanned_member->field->default_value; - if (maybe_clear && bd->data != NULL && - (def_bd == NULL || bd->data != def_bd->data)) { - PaddleMobile__Framework__do_free(allocator, bd->data); - } - if (len - pref_len > 0) { - bd->data = (uint8_t *)PaddleMobile__Framework__do_alloc(allocator, - len - pref_len); - if (bd->data == NULL) return FALSE; - memcpy(bd->data, data + pref_len, len - pref_len); - } else { - bd->data = NULL; - } - bd->len = len - pref_len; - return TRUE; - } - case PROTOBUF_C_TYPE_MESSAGE: { - PaddleMobile__Framework__ProtobufCMessage **pmessage = - (PaddleMobile__Framework__ProtobufCMessage **)member; - PaddleMobile__Framework__ProtobufCMessage *subm; - const PaddleMobile__Framework__ProtobufCMessage *def_mess; - protobuf_c_boolean merge_successful = TRUE; - unsigned pref_len = scanned_member->length_prefix_len; - - if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; - - def_mess = (const PaddleMobile__Framework__ProtobufCMessage *) - scanned_member->field->default_value; - subm = PaddleMobile__Framework__protobuf_c_message_unpack( - (const PaddleMobile__Framework__ProtobufCMessageDescriptor *) - scanned_member->field->descriptor, - allocator, len - pref_len, data + pref_len); - - if (maybe_clear && *pmessage != NULL && *pmessage != def_mess) { - if (subm != NULL) - merge_successful = PaddleMobile__Framework__merge_messages( - 
*pmessage, subm, allocator); - /* Delete the previous message */ - PaddleMobile__Framework__protobuf_c_message_free_unpacked(*pmessage, - allocator); - } - *pmessage = subm; - if (subm == NULL || !merge_successful) return FALSE; - return TRUE; - } - } - return FALSE; -} - -static protobuf_c_boolean PaddleMobile__Framework__parse_oneof_member( - ScannedMember *scanned_member, void *member, - PaddleMobile__Framework__ProtobufCMessage *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - uint32_t *oneof_case = STRUCT_MEMBER_PTR( - uint32_t, message, scanned_member->field->quantifier_offset); - - /* If we have already parsed a member of this oneof, free it. */ - if (*oneof_case != 0) { - /* lookup field */ - int field_index = PaddleMobile__Framework__int_range_lookup( - message->descriptor->n_field_ranges, message->descriptor->field_ranges, - *oneof_case); - const PaddleMobile__Framework__ProtobufCFieldDescriptor *old_field = - message->descriptor->fields + field_index; - size_t el_size = - PaddleMobile__Framework__sizeof_elt_in_repeated_array(old_field->type); - - switch (old_field->type) { - case PROTOBUF_C_TYPE_STRING: { - char **pstr = (char **)member; - const char *def = (const char *)old_field->default_value; - if (*pstr != NULL && *pstr != def) - PaddleMobile__Framework__do_free(allocator, *pstr); - break; - } - case PROTOBUF_C_TYPE_BYTES: { - PaddleMobile__Framework__ProtobufCBinaryData *bd = - (PaddleMobile__Framework__ProtobufCBinaryData *)member; - const PaddleMobile__Framework__ProtobufCBinaryData *def_bd = - (const PaddleMobile__Framework__ProtobufCBinaryData *) - old_field->default_value; - if (bd->data != NULL && (def_bd == NULL || bd->data != def_bd->data)) { - PaddleMobile__Framework__do_free(allocator, bd->data); - } - break; - } - case PROTOBUF_C_TYPE_MESSAGE: { - PaddleMobile__Framework__ProtobufCMessage **pmessage = - (PaddleMobile__Framework__ProtobufCMessage **)member; - const PaddleMobile__Framework__ProtobufCMessage *def_mess 
= - (const PaddleMobile__Framework__ProtobufCMessage *) - old_field->default_value; - if (*pmessage != NULL && *pmessage != def_mess) - PaddleMobile__Framework__protobuf_c_message_free_unpacked(*pmessage, - allocator); - break; - } - default: - break; - } - - memset(member, 0, el_size); - } - if (!PaddleMobile__Framework__parse_required_member(scanned_member, member, - allocator, TRUE)) - return FALSE; - - *oneof_case = scanned_member->tag; - return TRUE; -} - -static protobuf_c_boolean PaddleMobile__Framework__parse_optional_member( - ScannedMember *scanned_member, void *member, - PaddleMobile__Framework__ProtobufCMessage *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - if (!PaddleMobile__Framework__parse_required_member(scanned_member, member, - allocator, TRUE)) - return FALSE; - if (scanned_member->field->quantifier_offset != 0) - STRUCT_MEMBER(protobuf_c_boolean, message, - scanned_member->field->quantifier_offset) = TRUE; - return TRUE; -} - -static protobuf_c_boolean PaddleMobile__Framework__parse_repeated_member( - ScannedMember *scanned_member, void *member, - PaddleMobile__Framework__ProtobufCMessage *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field = - scanned_member->field; - size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); - size_t siz = - PaddleMobile__Framework__sizeof_elt_in_repeated_array(field->type); - char *array = *(char **)member; - - if (!PaddleMobile__Framework__parse_required_member( - scanned_member, array + siz * (*p_n), allocator, FALSE)) { - return FALSE; - } - *p_n += 1; - return TRUE; -} - -static unsigned PaddleMobile__Framework__scan_varint(unsigned len, - const uint8_t *data) { - unsigned i; - if (len > 10) len = 10; - for (i = 0; i < len; i++) - if ((data[i] & 0x80) == 0) break; - if (i == len) return 0; - return i + 1; -} - -static protobuf_c_boolean 
PaddleMobile__Framework__parse_packed_repeated_member( - ScannedMember *scanned_member, void *member, - PaddleMobile__Framework__ProtobufCMessage *message) { - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field = - scanned_member->field; - size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); - size_t siz = - PaddleMobile__Framework__sizeof_elt_in_repeated_array(field->type); - void *array = *(char **)member + siz * (*p_n); - const uint8_t *at = scanned_member->data + scanned_member->length_prefix_len; - size_t rem = scanned_member->len - scanned_member->length_prefix_len; - size_t count = 0; - unsigned i; - - switch (field->type) { - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - count = (scanned_member->len - scanned_member->length_prefix_len) / 4; -#if !defined(WORDS_BIGENDIAN) - goto no_unpacking_needed; -#else - for (i = 0; i < count; i++) { - ((uint32_t *)array)[i] = - PaddleMobile__Framework__parse_fixed_uint32(at); - at += 4; - } - break; -#endif - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - count = (scanned_member->len - scanned_member->length_prefix_len) / 8; -#if !defined(WORDS_BIGENDIAN) - goto no_unpacking_needed; -#else - for (i = 0; i < count; i++) { - ((uint64_t *)array)[i] = - PaddleMobile__Framework__parse_fixed_uint64(at); - at += 8; - } - break; -#endif - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: - while (rem > 0) { - unsigned s = PaddleMobile__Framework__scan_varint(rem, at); - if (s == 0) { - PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int32 value"); - return FALSE; - } - ((int32_t *)array)[count++] = - PaddleMobile__Framework__parse_int32(s, at); - at += s; - rem -= s; - } - break; - case PROTOBUF_C_TYPE_SINT32: - while (rem > 0) { - unsigned s = PaddleMobile__Framework__scan_varint(rem, at); - if (s == 0) { - PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint32 value"); - return FALSE; - } - 
((int32_t *)array)[count++] = - unzigzag32(PaddleMobile__Framework__parse_uint32(s, at)); - at += s; - rem -= s; - } - break; - case PROTOBUF_C_TYPE_UINT32: - while (rem > 0) { - unsigned s = PaddleMobile__Framework__scan_varint(rem, at); - if (s == 0) { - PROTOBUF_C_UNPACK_ERROR("bad packed-repeated enum or uint32 value"); - return FALSE; - } - ((uint32_t *)array)[count++] = - PaddleMobile__Framework__parse_uint32(s, at); - at += s; - rem -= s; - } - break; - - case PROTOBUF_C_TYPE_SINT64: - while (rem > 0) { - unsigned s = PaddleMobile__Framework__scan_varint(rem, at); - if (s == 0) { - PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint64 value"); - return FALSE; - } - ((int64_t *)array)[count++] = PaddleMobile__Framework__unzigzag64( - PaddleMobile__Framework__parse_uint64(s, at)); - at += s; - rem -= s; - } - break; - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - while (rem > 0) { - unsigned s = PaddleMobile__Framework__scan_varint(rem, at); - if (s == 0) { - PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int64/uint64 value"); - return FALSE; - } - ((int64_t *)array)[count++] = - PaddleMobile__Framework__parse_uint64(s, at); - at += s; - rem -= s; - } - break; - case PROTOBUF_C_TYPE_BOOL: - count = rem; - for (i = 0; i < count; i++) { - if (at[i] > 1) { - PROTOBUF_C_UNPACK_ERROR("bad packed-repeated boolean value"); - return FALSE; - } - ((protobuf_c_boolean *)array)[i] = at[i]; - } - break; - default: - PROTOBUF_C__ASSERT_NOT_REACHED(); - } - *p_n += count; - return TRUE; - -#if !defined(WORDS_BIGENDIAN) -no_unpacking_needed: - memcpy(array, at, count * siz); - *p_n += count; - return TRUE; -#endif -} - -static protobuf_c_boolean PaddleMobile__Framework__is_packable_type( - PaddleMobile__Framework__ProtobufCType type) { - return type != PROTOBUF_C_TYPE_STRING && type != PROTOBUF_C_TYPE_BYTES && - type != PROTOBUF_C_TYPE_MESSAGE; -} - -static protobuf_c_boolean PaddleMobile__Framework__parse_member( - ScannedMember *scanned_member, - 
PaddleMobile__Framework__ProtobufCMessage *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field = - scanned_member->field; - void *member; - - if (field == NULL) { - PaddleMobile__Framework__ProtobufCMessageUnknownField *ufield = - message->unknown_fields + (message->n_unknown_fields++); - ufield->tag = scanned_member->tag; - ufield->wire_type = - (PaddleMobile__Framework__ProtobufCWireType)scanned_member->wire_type; - ufield->len = scanned_member->len; - ufield->data = (uint8_t *)PaddleMobile__Framework__do_alloc( - allocator, scanned_member->len); - if (ufield->data == NULL) return FALSE; - memcpy(ufield->data, scanned_member->data, ufield->len); - return TRUE; - } - member = (char *)message + field->offset; - switch (field->label) { - case PROTOBUF_C_LABEL_REQUIRED: - return PaddleMobile__Framework__parse_required_member( - scanned_member, member, allocator, TRUE); - case PROTOBUF_C_LABEL_OPTIONAL: - case PROTOBUF_C_LABEL_NONE: - if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF)) { - return PaddleMobile__Framework__parse_oneof_member( - scanned_member, member, message, allocator); - } else { - return PaddleMobile__Framework__parse_optional_member( - scanned_member, member, message, allocator); - } - case PROTOBUF_C_LABEL_REPEATED: - if (scanned_member->wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && - (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || - PaddleMobile__Framework__is_packable_type(field->type))) { - return PaddleMobile__Framework__parse_packed_repeated_member( - scanned_member, member, message); - } else { - return PaddleMobile__Framework__parse_repeated_member( - scanned_member, member, message, allocator); - } - } - PROTOBUF_C__ASSERT_NOT_REACHED(); - return 0; -} - -/** - * Initialise messages generated by old code. 
- * - * This function is used if desc->message_init == NULL (which occurs - * for old code, and which would be useful to support allocating - * descriptors dynamically). - */ -static void PaddleMobile__Framework__message_init_generic( - const PaddleMobile__Framework__ProtobufCMessageDescriptor *desc, - PaddleMobile__Framework__ProtobufCMessage *message) { - unsigned i; - - memset(message, 0, desc->sizeof_message); - message->descriptor = desc; - for (i = 0; i < desc->n_fields; i++) { - if (desc->fields[i].default_value != NULL && - desc->fields[i].label != PROTOBUF_C_LABEL_REPEATED) { - void *field = STRUCT_MEMBER_P(message, desc->fields[i].offset); - const void *dv = desc->fields[i].default_value; - - switch (desc->fields[i].type) { - case PROTOBUF_C_TYPE_INT32: - case PROTOBUF_C_TYPE_SINT32: - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_UINT32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - case PROTOBUF_C_TYPE_ENUM: - memcpy(field, dv, 4); - break; - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_SINT64: - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_UINT64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - memcpy(field, dv, 8); - break; - case PROTOBUF_C_TYPE_BOOL: - memcpy(field, dv, sizeof(protobuf_c_boolean)); - break; - case PROTOBUF_C_TYPE_BYTES: - memcpy(field, dv, - sizeof(PaddleMobile__Framework__ProtobufCBinaryData)); - break; - - case PROTOBUF_C_TYPE_STRING: - case PROTOBUF_C_TYPE_MESSAGE: - /* - * The next line essentially implements a cast - * from const, which is totally unavoidable. - */ - *(const void **)field = dv; - break; - } - } - } -} - -/**@}*/ - -/* - * ScannedMember slabs (an unpacking implementation detail). Before doing real - * unpacking, we first scan through the elements to see how many there are (for - * repeated fields), and which field to use (for non-repeated fields given - * twice). 
- * - * In order to avoid allocations for small messages, we keep a stack-allocated - * slab of ScannedMembers of size FIRST_SCANNED_MEMBER_SLAB_SIZE (16). After we - * fill that up, we allocate each slab twice as large as the previous one. - */ -#define FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2 4 - -/* - * The number of slabs, including the stack-allocated ones; choose the number so - * that we would overflow if we needed a slab larger than provided. - */ -#define MAX_SCANNED_MEMBER_SLAB \ - (sizeof(unsigned int) * 8 - 1 - BOUND_SIZEOF_SCANNED_MEMBER_LOG2 - \ - FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2) - -#define REQUIRED_FIELD_BITMAP_SET(index) \ - (required_fields_bitmap[(index) / 8] |= (1UL << ((index) % 8))) - -#define REQUIRED_FIELD_BITMAP_IS_SET(index) \ - (required_fields_bitmap[(index) / 8] & (1UL << ((index) % 8))) - -PaddleMobile__Framework__ProtobufCMessage * -PaddleMobile__Framework__protobuf_c_message_unpack( - const PaddleMobile__Framework__ProtobufCMessageDescriptor *desc, - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data) { - PaddleMobile__Framework__ProtobufCMessage *rv; - size_t rem = len; - const uint8_t *at = data; - const PaddleMobile__Framework__ProtobufCFieldDescriptor *last_field = - desc->fields + 0; - ScannedMember first_member_slab[1UL << FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2]; - - /* - * scanned_member_slabs[i] is an array of arrays of ScannedMember. - * The first slab (scanned_member_slabs[0] is just a pointer to - * first_member_slab), above. All subsequent slabs will be allocated - * using the allocator. 
- */ - ScannedMember *scanned_member_slabs[MAX_SCANNED_MEMBER_SLAB + 1]; - unsigned which_slab = 0; /* the slab we are currently populating */ - unsigned in_slab_index = 0; /* number of members in the slab */ - size_t n_unknown = 0; - unsigned f; - unsigned j; - unsigned i_slab; - unsigned last_field_index = 0; - unsigned required_fields_bitmap_len; - unsigned char required_fields_bitmap_stack[16]; - unsigned char *required_fields_bitmap = required_fields_bitmap_stack; - protobuf_c_boolean required_fields_bitmap_alloced = FALSE; - - ASSERT_IS_MESSAGE_DESCRIPTOR(desc); - - if (allocator == NULL) allocator = &protobuf_c__allocator; - - rv = (PaddleMobile__Framework__ProtobufCMessage *) - PaddleMobile__Framework__do_alloc(allocator, desc->sizeof_message); - if (!rv) return (NULL); - scanned_member_slabs[0] = first_member_slab; - - required_fields_bitmap_len = (desc->n_fields + 7) / 8; - if (required_fields_bitmap_len > sizeof(required_fields_bitmap_stack)) { - required_fields_bitmap = (unsigned char *)PaddleMobile__Framework__do_alloc( - allocator, required_fields_bitmap_len); - if (!required_fields_bitmap) { - PaddleMobile__Framework__do_free(allocator, rv); - return (NULL); - } - required_fields_bitmap_alloced = TRUE; - } - memset(required_fields_bitmap, 0, required_fields_bitmap_len); - - /* - * Generated code always defines "message_init". However, we provide a - * fallback for (1) users of old protobuf-c generated-code that do not - * provide the function, and (2) descriptors constructed from some other - * source (most likely, direct construction from the .proto file). 
- */ - if (desc->message_init != NULL) - PaddleMobile__Framework__protobuf_c_message_init(desc, rv); - else - PaddleMobile__Framework__message_init_generic(desc, rv); - - while (rem > 0) { - uint32_t tag; - PaddleMobile__Framework__ProtobufCWireType wire_type; - size_t used = PaddleMobile__Framework__parse_tag_and_wiretype(rem, at, &tag, - &wire_type); - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field; - ScannedMember tmp; - - if (used == 0) { - PROTOBUF_C_UNPACK_ERROR("error parsing tag/wiretype at offset %u", - (unsigned)(at - data)); - goto error_cleanup_during_scan; - } - /* - * \todo Consider optimizing for field[1].id == tag, if field[1] - * exists! - */ - if (last_field == NULL || last_field->id != tag) { - /* lookup field */ - int field_index = PaddleMobile__Framework__int_range_lookup( - desc->n_field_ranges, desc->field_ranges, tag); - if (field_index < 0) { - field = NULL; - n_unknown++; - } else { - field = desc->fields + field_index; - last_field = field; - last_field_index = field_index; - } - } else { - field = last_field; - } - - if (field != NULL && field->label == PROTOBUF_C_LABEL_REQUIRED) - REQUIRED_FIELD_BITMAP_SET(last_field_index); - - at += used; - rem -= used; - tmp.tag = tag; - tmp.wire_type = wire_type; - tmp.field = field; - tmp.data = at; - tmp.length_prefix_len = 0; - - switch (wire_type) { - case PROTOBUF_C_WIRE_TYPE_VARINT: { - unsigned max_len = rem < 10 ? 
rem : 10; - unsigned i; - - for (i = 0; i < max_len; i++) - if ((at[i] & 0x80) == 0) break; - if (i == max_len) { - PROTOBUF_C_UNPACK_ERROR("unterminated varint at offset %u", - (unsigned)(at - data)); - goto error_cleanup_during_scan; - } - tmp.len = i + 1; - break; - } - case PROTOBUF_C_WIRE_TYPE_64BIT: - if (rem < 8) { - PROTOBUF_C_UNPACK_ERROR("too short after 64bit wiretype at offset %u", - (unsigned)(at - data)); - goto error_cleanup_during_scan; - } - tmp.len = 8; - break; - case PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED: { - size_t pref_len; - - tmp.len = PaddleMobile__Framework__scan_length_prefixed_data(rem, at, - &pref_len); - if (tmp.len == 0) { - /* NOTE: PaddleMobile__Framework__scan_length_prefixed_data calls - * UNPACK_ERROR */ - goto error_cleanup_during_scan; - } - tmp.length_prefix_len = pref_len; - break; - } - case PROTOBUF_C_WIRE_TYPE_32BIT: - if (rem < 4) { - PROTOBUF_C_UNPACK_ERROR("too short after 32bit wiretype at offset %u", - (unsigned)(at - data)); - goto error_cleanup_during_scan; - } - tmp.len = 4; - break; - default: - PROTOBUF_C_UNPACK_ERROR("unsupported tag %u at offset %u", wire_type, - (unsigned)(at - data)); - goto error_cleanup_during_scan; - } - - if (in_slab_index == - (1UL << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2))) { - size_t size; - - in_slab_index = 0; - if (which_slab == MAX_SCANNED_MEMBER_SLAB) { - PROTOBUF_C_UNPACK_ERROR("too many fields"); - goto error_cleanup_during_scan; - } - which_slab++; - size = sizeof(ScannedMember) - << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2); - scanned_member_slabs[which_slab] = - (ScannedMember *)PaddleMobile__Framework__do_alloc(allocator, size); - if (scanned_member_slabs[which_slab] == NULL) - goto error_cleanup_during_scan; - } - scanned_member_slabs[which_slab][in_slab_index++] = tmp; - - if (field != NULL && field->label == PROTOBUF_C_LABEL_REPEATED) { - size_t *n = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); - if (wire_type == 
PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && - (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || - PaddleMobile__Framework__is_packable_type(field->type))) { - size_t count; - if (!PaddleMobile__Framework__count_packed_elements( - field->type, tmp.len - tmp.length_prefix_len, - tmp.data + tmp.length_prefix_len, &count)) { - PROTOBUF_C_UNPACK_ERROR("counting packed elements"); - goto error_cleanup_during_scan; - } - *n += count; - } else { - *n += 1; - } - } - - at += tmp.len; - rem -= tmp.len; - } - - /* allocate space for repeated fields, also check that all required fields - * have been set */ - for (f = 0; f < desc->n_fields; f++) { - const PaddleMobile__Framework__ProtobufCFieldDescriptor *field = - desc->fields + f; - if (field->label == PROTOBUF_C_LABEL_REPEATED) { - size_t siz = - PaddleMobile__Framework__sizeof_elt_in_repeated_array(field->type); - size_t *n_ptr = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); - if (*n_ptr != 0) { - unsigned n = *n_ptr; - void *a; - *n_ptr = 0; - assert(rv->descriptor != NULL); -#define CLEAR_REMAINING_N_PTRS() \ - for (f++; f < desc->n_fields; f++) { \ - field = desc->fields + f; \ - if (field->label == PROTOBUF_C_LABEL_REPEATED) \ - STRUCT_MEMBER(size_t, rv, field->quantifier_offset) = 0; \ - } - a = PaddleMobile__Framework__do_alloc(allocator, siz * n); - if (!a) { - CLEAR_REMAINING_N_PTRS(); - goto error_cleanup; - } - STRUCT_MEMBER(void *, rv, field->offset) = a; - } - } else if (field->label == PROTOBUF_C_LABEL_REQUIRED) { - if (field->default_value == NULL && !REQUIRED_FIELD_BITMAP_IS_SET(f)) { - CLEAR_REMAINING_N_PTRS(); - PROTOBUF_C_UNPACK_ERROR("message '%s': missing required field '%s'", - desc->name, field->name); - goto error_cleanup; - } - } - } -#undef CLEAR_REMAINING_N_PTRS - - /* allocate space for unknown fields */ - if (n_unknown) { - rv->unknown_fields = - (PaddleMobile__Framework__ProtobufCMessageUnknownField *) - PaddleMobile__Framework__do_alloc( - allocator, - n_unknown * - sizeof( - 
PaddleMobile__Framework__ProtobufCMessageUnknownField)); - if (rv->unknown_fields == NULL) goto error_cleanup; - } - - /* do real parsing */ - for (i_slab = 0; i_slab <= which_slab; i_slab++) { - unsigned max = - (i_slab == which_slab) ? in_slab_index : (1UL << (i_slab + 4)); - ScannedMember *slab = scanned_member_slabs[i_slab]; - - for (j = 0; j < max; j++) { - if (!PaddleMobile__Framework__parse_member(slab + j, rv, allocator)) { - PROTOBUF_C_UNPACK_ERROR( - "error parsing member %s of %s", - slab->field ? slab->field->name : "*unknown-field*", desc->name); - goto error_cleanup; - } - } - } - - /* cleanup */ - for (j = 1; j <= which_slab; j++) - PaddleMobile__Framework__do_free(allocator, scanned_member_slabs[j]); - if (required_fields_bitmap_alloced) - PaddleMobile__Framework__do_free(allocator, required_fields_bitmap); - return rv; - -error_cleanup: - PaddleMobile__Framework__protobuf_c_message_free_unpacked(rv, allocator); - for (j = 1; j <= which_slab; j++) - PaddleMobile__Framework__do_free(allocator, scanned_member_slabs[j]); - if (required_fields_bitmap_alloced) - PaddleMobile__Framework__do_free(allocator, required_fields_bitmap); - return NULL; - -error_cleanup_during_scan: - PaddleMobile__Framework__do_free(allocator, rv); - for (j = 1; j <= which_slab; j++) - PaddleMobile__Framework__do_free(allocator, scanned_member_slabs[j]); - if (required_fields_bitmap_alloced) - PaddleMobile__Framework__do_free(allocator, required_fields_bitmap); - return NULL; -} - -void PaddleMobile__Framework__protobuf_c_message_free_unpacked( - PaddleMobile__Framework__ProtobufCMessage *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator) { - const PaddleMobile__Framework__ProtobufCMessageDescriptor *desc; - unsigned f; - - if (message == NULL) return; - - desc = message->descriptor; - - ASSERT_IS_MESSAGE(message); - - if (allocator == NULL) allocator = &protobuf_c__allocator; - message->descriptor = NULL; - for (f = 0; f < desc->n_fields; f++) { - if (0 != 
(desc->fields[f].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) && - desc->fields[f].id != - STRUCT_MEMBER(uint32_t, message, - desc->fields[f].quantifier_offset)) { - /* This is not the selected oneof, skip it */ - continue; - } - - if (desc->fields[f].label == PROTOBUF_C_LABEL_REPEATED) { - size_t n = - STRUCT_MEMBER(size_t, message, desc->fields[f].quantifier_offset); - void *arr = STRUCT_MEMBER(void *, message, desc->fields[f].offset); - - if (arr != NULL) { - if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { - unsigned i; - for (i = 0; i < n; i++) - PaddleMobile__Framework__do_free(allocator, ((char **)arr)[i]); - } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { - unsigned i; - for (i = 0; i < n; i++) - PaddleMobile__Framework__do_free( - allocator, - ((PaddleMobile__Framework__ProtobufCBinaryData *)arr)[i].data); - } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { - unsigned i; - for (i = 0; i < n; i++) - PaddleMobile__Framework__protobuf_c_message_free_unpacked( - ((PaddleMobile__Framework__ProtobufCMessage **)arr)[i], - allocator); - } - PaddleMobile__Framework__do_free(allocator, arr); - } - } else if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { - char *str = STRUCT_MEMBER(char *, message, desc->fields[f].offset); - - if (str && str != desc->fields[f].default_value) - PaddleMobile__Framework__do_free(allocator, str); - } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { - void *data = STRUCT_MEMBER(PaddleMobile__Framework__ProtobufCBinaryData, - message, desc->fields[f].offset) - .data; - const PaddleMobile__Framework__ProtobufCBinaryData *default_bd; - - default_bd = - (const PaddleMobile__Framework__ProtobufCBinaryData *)desc->fields[f] - .default_value; - if (data != NULL && (default_bd == NULL || default_bd->data != data)) { - PaddleMobile__Framework__do_free(allocator, data); - } - } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { - PaddleMobile__Framework__ProtobufCMessage *sm; - - sm = 
STRUCT_MEMBER(PaddleMobile__Framework__ProtobufCMessage *, message, - desc->fields[f].offset); - if (sm && sm != desc->fields[f].default_value) - PaddleMobile__Framework__protobuf_c_message_free_unpacked(sm, - allocator); - } - } - - for (f = 0; f < message->n_unknown_fields; f++) - PaddleMobile__Framework__do_free(allocator, - message->unknown_fields[f].data); - if (message->unknown_fields != NULL) - PaddleMobile__Framework__do_free(allocator, message->unknown_fields); - - PaddleMobile__Framework__do_free(allocator, message); -} - -void PaddleMobile__Framework__protobuf_c_message_init( - const PaddleMobile__Framework__ProtobufCMessageDescriptor *descriptor, - void *message) { - descriptor->message_init( - (PaddleMobile__Framework__ProtobufCMessage *)(message)); -} - -protobuf_c_boolean PaddleMobile__Framework__protobuf_c_message_check( - const PaddleMobile__Framework__ProtobufCMessage *message) { - unsigned i; - - if (!message || !message->descriptor || - message->descriptor->magic != PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) { - return FALSE; - } - - for (i = 0; i < message->descriptor->n_fields; i++) { - const PaddleMobile__Framework__ProtobufCFieldDescriptor *f = - message->descriptor->fields + i; - PaddleMobile__Framework__ProtobufCType type = f->type; - PaddleMobile__Framework__ProtobufCLabel label = f->label; - void *field = STRUCT_MEMBER_P(message, f->offset); - - if (label == PROTOBUF_C_LABEL_REPEATED) { - size_t *quantity = - (size_t *)STRUCT_MEMBER_P(message, f->quantifier_offset); - - if (*quantity > 0 && *(void **)field == NULL) { - return FALSE; - } - - if (type == PROTOBUF_C_TYPE_MESSAGE) { - PaddleMobile__Framework__ProtobufCMessage **submessage = - *(PaddleMobile__Framework__ProtobufCMessage ***)field; - unsigned j; - for (j = 0; j < *quantity; j++) { - if (!PaddleMobile__Framework__protobuf_c_message_check(submessage[j])) - return FALSE; - } - } else if (type == PROTOBUF_C_TYPE_STRING) { - char **string = *(char ***)field; - unsigned j; - for (j = 0; 
j < *quantity; j++) { - if (!string[j]) return FALSE; - } - } else if (type == PROTOBUF_C_TYPE_BYTES) { - PaddleMobile__Framework__ProtobufCBinaryData *bd = - *(PaddleMobile__Framework__ProtobufCBinaryData **)field; - unsigned j; - for (j = 0; j < *quantity; j++) { - if (bd[j].len > 0 && bd[j].data == NULL) return FALSE; - } - } - - } else { /* PROTOBUF_C_LABEL_REQUIRED or PROTOBUF_C_LABEL_OPTIONAL */ - - if (type == PROTOBUF_C_TYPE_MESSAGE) { - PaddleMobile__Framework__ProtobufCMessage *submessage = - *(PaddleMobile__Framework__ProtobufCMessage **)field; - if (label == PROTOBUF_C_LABEL_REQUIRED || submessage != NULL) { - if (!PaddleMobile__Framework__protobuf_c_message_check(submessage)) - return FALSE; - } - } else if (type == PROTOBUF_C_TYPE_STRING) { - char *string = *(char **)field; - if (label == PROTOBUF_C_LABEL_REQUIRED && string == NULL) return FALSE; - } else if (type == PROTOBUF_C_TYPE_BYTES) { - protobuf_c_boolean *has = (protobuf_c_boolean *)STRUCT_MEMBER_P( - message, f->quantifier_offset); - PaddleMobile__Framework__ProtobufCBinaryData *bd = - (PaddleMobile__Framework__ProtobufCBinaryData *)field; - if (label == PROTOBUF_C_LABEL_REQUIRED || *has == TRUE) { - if (bd->len > 0 && bd->data == NULL) return FALSE; - } - } - } - } - - return TRUE; -} - -/* === services === */ - -typedef void (*GenericHandler)( - void *service, const PaddleMobile__Framework__ProtobufCMessage *input, - ProtobufCClosure closure, void *closure_data); diff --git a/mobile/src/protobuf-c/protobuf-c.h b/mobile/src/protobuf-c/protobuf-c.h deleted file mode 100644 index ffb86e8612..0000000000 --- a/mobile/src/protobuf-c/protobuf-c.h +++ /dev/null @@ -1,962 +0,0 @@ -/* - * Copyright (c) 2008-2017, Dave Benson and the protobuf-c authors. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/*! \file - * \mainpage Introduction - * - * This is [protobuf-c], a C implementation of [Protocol Buffers]. - * - * This file defines the public API for the `libprotobuf-c` support library. - * This API includes interfaces that can be used directly by client code as well - * as the interfaces used by the code generated by the `protoc-c` compiler. - * - * The `libprotobuf-c` support library performs the actual serialization and - * deserialization of Protocol Buffers messages. It interacts with structures, - * definitions, and metadata generated by the `protoc-c` compiler from .proto - * files. - * - * \authors Dave Benson and the `protobuf-c` authors. 
- * - * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license. - * - * [protobuf-c]: https://github.com/protobuf-c/protobuf-c - * [Protocol Buffers]: https://developers.google.com/protocol-buffers/ - * [BSD-2-Clause]: http://opensource.org/licenses/BSD-2-Clause - * - * \page gencode Generated Code - * - * For each enum, we generate a C enum. For each message, we generate a C - * structure which can be cast to a `PaddleMobile__Framework__ProtobufCMessage`. - * - * For each enum and message, we generate a descriptor object that allows us to - * implement a kind of reflection on the structures. - * - * First, some naming conventions: - * - * - The name of the type for enums and messages and services is camel case - * (meaning WordsAreCrammedTogether) except that double underscores are used - * to delimit scopes. For example, the following `.proto` file: - * -~~~{.proto} - package foo.bar; - message BazBah { - optional int32 val = 1; - } -~~~ - * - * would generate a C type `Foo__Bar__BazBah`. - * - * - Identifiers for functions and globals are all lowercase, with camel case - * words separated by single underscores. For example, one of the function - * prototypes generated by `protoc-c` for the above example: - * -~~~{.c} -Foo__Bar__BazBah * - foo__bar__baz_bah__unpack - (PaddleMobile__Framework__ProtobufCAllocator *allocator, - size_t len, - const uint8_t *data); -~~~ - * - * - Identifiers for enum values contain an uppercase prefix which embeds the - * package name and the enum type name. - * - * - A double underscore is used to separate further components of identifier - * names. - * - * For example, in the name of the unpack function above, the package name - * `foo.bar` has become `foo__bar`, the message name BazBah has become - * `baz_bah`, and the method name is `unpack`. These are all joined with double - * underscores to form the C identifier `foo__bar__baz_bah__unpack`. 
- * - * We also generate descriptor objects for messages and enums. These are - * declared in the `.pb-c.h` files: - * -~~~{.c} -extern const PaddleMobile__Framework__ProtobufCMessageDescriptor -foo__bar__baz_bah__descriptor; -~~~ - * - * The message structures all begin with -`PaddleMobile__Framework__ProtobufCMessageDescriptor *` which is - * sufficient to allow them to be cast to -`PaddleMobile__Framework__ProtobufCMessage`. - * - * For each message defined in a `.proto` file, we generate a number of - * functions and macros. Each function name contains a prefix based on the - * package name and message name in order to make it a unique C identifier. - * - * - `INIT`. Statically initializes a message object, initializing its - * descriptor and setting its fields to default values. Uninitialized - * messages cannot be processed by the protobuf-c library. - * -~~~{.c} -#define FOO__BAR__BAZ_BAH__INIT \ - { PROTOBUF_C_MESSAGE_INIT (&foo__bar__baz_bah__descriptor), 0 } -~~~ - * - `init()`. Initializes a message object, initializing its descriptor and - * setting its fields to default values. Uninitialized messages cannot be - * processed by the protobuf-c library. - * -~~~{.c} -void foo__bar__baz_bah__init - (Foo__Bar__BazBah *message); -~~~ - * - `unpack()`. Unpacks data for a particular message format. Note that the - * `allocator` parameter is usually `NULL` to indicate that the system's - * `malloc()` and `free()` functions should be used for dynamically allocating - * memory. - * -~~~{.c} -Foo__Bar__BazBah * - foo__bar__baz_bah__unpack - (PaddleMobile__Framework__ProtobufCAllocator *allocator, - size_t len, - const uint8_t *data); -~~~ - * - * - `free_unpacked()`. Frees a message object obtained with the `unpack()` - * method. Freeing `NULL` is allowed (the same as with `free()`). - * -~~~{.c} -void foo__bar__baz_bah__free_unpacked - (Foo__Bar__BazBah *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); -~~~ - * - * - `get_packed_size()`. 
Calculates the length in bytes of the serialized - * representation of the message object. - * -~~~{.c} -size_t foo__bar__baz_bah__get_packed_size - (const Foo__Bar__BazBah *message); -~~~ - * - * - `pack()`. Pack a message object into a preallocated buffer. Assumes that - * the buffer is large enough. (Use `get_packed_size()` first.) - * -~~~{.c} -size_t foo__bar__baz_bah__pack - (const Foo__Bar__BazBah *message, - uint8_t *out); -~~~ - * - * - `pack_to_buffer()`. Packs a message into a "virtual buffer". This is an - * object which defines an "append bytes" callback to consume data as it is - * serialized. - * -~~~{.c} -size_t foo__bar__baz_bah__pack_to_buffer - (const Foo__Bar__BazBah *message, - PaddleMobile__Framework__ProtobufCBuffer *buffer); -~~~ - * - * \page pack Packing and unpacking messages - * - * To pack a message, first compute the packed size of the message with - * PaddleMobile__Framework__protobuf_c_message_get_packed_size(), then allocate -a buffer of at least - * that size, then call protobuf_c_message_pack(). - * - * Alternatively, a message can be serialized without calculating the final size - * first. Use the protobuf_c_message_pack_to_buffer() function and provide a - * PaddleMobile__Framework__ProtobufCBuffer object which implements an "append" -method that consumes - * data. - * - * To unpack a message, call the -PaddleMobile__Framework__protobuf_c_message_unpack() function. The - * result can be cast to an object of the type that matches the descriptor for - * the message. - * - * The result of unpacking a message should be freed with - * PaddleMobile__Framework__protobuf_c_message_free_unpacked(). 
- */ - -#ifndef PROTOBUF_C_H -#define PROTOBUF_C_H - -#include -#include -#include -#include - -#ifdef __cplusplus -#define PROTOBUF_C__BEGIN_DECLS extern "C" { -#define PROTOBUF_C__END_DECLS } -#else -#define PROTOBUF_C__BEGIN_DECLS -#define PROTOBUF_C__END_DECLS -#endif - -PROTOBUF_C__BEGIN_DECLS - -#if defined(_WIN32) && defined(PROTOBUF_C_USE_SHARED_LIB) -#ifdef PROTOBUF_C_EXPORT -#define PROTOBUF_C__API __declspec(dllexport) -#else -#define PROTOBUF_C__API __declspec(dllimport) -#endif -#else -#define PROTOBUF_C__API -#endif - -#if !defined(PROTOBUF_C__NO_DEPRECATED) && \ - ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) -#define PROTOBUF_C__DEPRECATED __attribute__((__deprecated__)) -#else -#define PROTOBUF_C__DEPRECATED -#endif - -#ifndef PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE -#define PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(enum_name) \ - , _##enum_name##_IS_INT_SIZE = INT_MAX -#endif - -#define PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC 0x14159bc3 -#define PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC 0x28aaeef9 -#define PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC 0x114315af - -/* Empty string used for initializers */ -extern const char PaddleMobile__Framework__protobuf_c_empty_string[]; - -/** - * \defgroup api Public API - * - * This is the public API for `libprotobuf-c`. These interfaces are stable and - * subject to Semantic Versioning guarantees. - * - * @{ - */ - -/** - * Values for the `flags` word in - * `PaddleMobile__Framework__ProtobufCFieldDescriptor`. - */ -typedef enum { - /** Set if the field is repeated and marked with the `packed` option. */ - PROTOBUF_C_FIELD_FLAG_PACKED = (1 << 0), - - /** Set if the field is marked with the `deprecated` option. */ - PROTOBUF_C_FIELD_FLAG_DEPRECATED = (1 << 1), - - /** Set if the field is a member of a oneof (union). */ - PROTOBUF_C_FIELD_FLAG_ONEOF = (1 << 2), -} PaddleMobile__Framework__ProtobufCFieldFlag; - -/** - * Message field rules. - * - * \see [Defining A Message Type] in the Protocol Buffers documentation. 
- * - * [Defining A Message Type]: - * https://developers.google.com/protocol-buffers/docs/proto#simple - */ -typedef enum { - /** A well-formed message must have exactly one of this field. */ - PROTOBUF_C_LABEL_REQUIRED, - - /** - * A well-formed message can have zero or one of this field (but not - * more than one). - */ - PROTOBUF_C_LABEL_OPTIONAL, - - /** - * This field can be repeated any number of times (including zero) in a - * well-formed message. The order of the repeated values will be - * preserved. - */ - PROTOBUF_C_LABEL_REPEATED, - - /** - * This field has no label. This is valid only in proto3 and is - * equivalent to OPTIONAL but no "has" quantifier will be consulted. - */ - PROTOBUF_C_LABEL_NONE, -} PaddleMobile__Framework__ProtobufCLabel; - -/** - * Field value types. - * - * \see [Scalar Value Types] in the Protocol Buffers documentation. - * - * [Scalar Value Types]: - * https://developers.google.com/protocol-buffers/docs/proto#scalar - */ -typedef enum { - PROTOBUF_C_TYPE_INT32, /**< int32 */ - PROTOBUF_C_TYPE_SINT32, /**< signed int32 */ - PROTOBUF_C_TYPE_SFIXED32, /**< signed int32 (4 bytes) */ - PROTOBUF_C_TYPE_INT64, /**< int64 */ - PROTOBUF_C_TYPE_SINT64, /**< signed int64 */ - PROTOBUF_C_TYPE_SFIXED64, /**< signed int64 (8 bytes) */ - PROTOBUF_C_TYPE_UINT32, /**< unsigned int32 */ - PROTOBUF_C_TYPE_FIXED32, /**< unsigned int32 (4 bytes) */ - PROTOBUF_C_TYPE_UINT64, /**< unsigned int64 */ - PROTOBUF_C_TYPE_FIXED64, /**< unsigned int64 (8 bytes) */ - PROTOBUF_C_TYPE_FLOAT, /**< float */ - PROTOBUF_C_TYPE_DOUBLE, /**< double */ - PROTOBUF_C_TYPE_BOOL, /**< boolean */ - PROTOBUF_C_TYPE_ENUM, /**< enumerated type */ - PROTOBUF_C_TYPE_STRING, /**< UTF-8 or ASCII string */ - PROTOBUF_C_TYPE_BYTES, /**< arbitrary byte sequence */ - PROTOBUF_C_TYPE_MESSAGE, /**< nested message */ -} PaddleMobile__Framework__ProtobufCType; - -/** - * Field wire types. - * - * \see [Message Structure] in the Protocol Buffers documentation. 
- * - * [Message Structure]: - * https://developers.google.com/protocol-buffers/docs/encoding#structure - */ -typedef enum { - PROTOBUF_C_WIRE_TYPE_VARINT = 0, - PROTOBUF_C_WIRE_TYPE_64BIT = 1, - PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED = 2, - /* "Start group" and "end group" wire types are unsupported. */ - PROTOBUF_C_WIRE_TYPE_32BIT = 5, -} PaddleMobile__Framework__ProtobufCWireType; - -struct PaddleMobile__Framework__ProtobufCAllocator; -struct PaddleMobile__Framework__ProtobufCBinaryData; -struct PaddleMobile__Framework__ProtobufCBuffer; -struct PaddleMobile__Framework__ProtobufCBufferSimple; -struct PaddleMobile__Framework__ProtobufCEnumDescriptor; -struct PaddleMobile__Framework__ProtobufCEnumValue; -struct PaddleMobile__Framework__ProtobufCEnumValueIndex; -struct PaddleMobile__Framework__ProtobufCFieldDescriptor; -struct PaddleMobile__Framework__ProtobufCIntRange; -struct PaddleMobile__Framework__ProtobufCMessage; -struct PaddleMobile__Framework__ProtobufCMessageDescriptor; -struct PaddleMobile__Framework__ProtobufCMessageUnknownField; -struct PaddleMobile__Framework__ProtobufCMethodDescriptor; -struct PaddleMobile__Framework__ProtobufCService; -struct PaddleMobile__Framework__ProtobufCServiceDescriptor; - -typedef struct PaddleMobile__Framework__ProtobufCAllocator - PaddleMobile__Framework__ProtobufCAllocator; -typedef struct PaddleMobile__Framework__ProtobufCBinaryData - PaddleMobile__Framework__ProtobufCBinaryData; -typedef struct PaddleMobile__Framework__ProtobufCBuffer - PaddleMobile__Framework__ProtobufCBuffer; -typedef struct PaddleMobile__Framework__ProtobufCBufferSimple - PaddleMobile__Framework__ProtobufCBufferSimple; -typedef struct PaddleMobile__Framework__ProtobufCEnumDescriptor - PaddleMobile__Framework__ProtobufCEnumDescriptor; -typedef struct PaddleMobile__Framework__ProtobufCEnumValue - PaddleMobile__Framework__ProtobufCEnumValue; -typedef struct PaddleMobile__Framework__ProtobufCEnumValueIndex - PaddleMobile__Framework__ProtobufCEnumValueIndex; 
-typedef struct PaddleMobile__Framework__ProtobufCFieldDescriptor - PaddleMobile__Framework__ProtobufCFieldDescriptor; -typedef struct PaddleMobile__Framework__ProtobufCIntRange - PaddleMobile__Framework__ProtobufCIntRange; -typedef struct PaddleMobile__Framework__ProtobufCMessage - PaddleMobile__Framework__ProtobufCMessage; -typedef struct PaddleMobile__Framework__ProtobufCMessageDescriptor - PaddleMobile__Framework__ProtobufCMessageDescriptor; -typedef struct PaddleMobile__Framework__ProtobufCMessageUnknownField - PaddleMobile__Framework__ProtobufCMessageUnknownField; -typedef struct PaddleMobile__Framework__ProtobufCMethodDescriptor - PaddleMobile__Framework__ProtobufCMethodDescriptor; -typedef struct PaddleMobile__Framework__ProtobufCService - PaddleMobile__Framework__ProtobufCService; -typedef struct PaddleMobile__Framework__ProtobufCServiceDescriptor - PaddleMobile__Framework__ProtobufCServiceDescriptor; - -/** Boolean type. */ -typedef int protobuf_c_boolean; - -typedef void (*ProtobufCClosure)( - const PaddleMobile__Framework__ProtobufCMessage *, void *closure_data); -typedef void (*ProtobufCMessageInit)( - PaddleMobile__Framework__ProtobufCMessage *); -typedef void (*ProtobufCServiceDestroy)( - PaddleMobile__Framework__ProtobufCService *); - -/** - * Structure for defining a custom memory allocator. - */ -struct PaddleMobile__Framework__ProtobufCAllocator { - /** Function to allocate memory. */ - void *(*alloc)(void *allocator_data, size_t size); - - /** Function to free memory. */ - void (*free)(void *allocator_data, void *pointer); - - /** Opaque pointer passed to `alloc` and `free` functions. */ - void *allocator_data; -}; - -/** - * Structure for the protobuf `bytes` scalar type. - * - * The data contained in a `PaddleMobile__Framework__ProtobufCBinaryData` is an - * arbitrary sequence of bytes. It may contain embedded `NUL` characters and is - * not required to be `NUL`-terminated. 
- */ -struct PaddleMobile__Framework__ProtobufCBinaryData { - size_t len; /**< Number of bytes in the `data` field. */ - uint8_t *data; /**< Data bytes. */ -}; - -/** - * Structure for defining a virtual append-only buffer. Used by - * protobuf_c_message_pack_to_buffer() to abstract the consumption of serialized - * bytes. - * - * `PaddleMobile__Framework__ProtobufCBuffer` "subclasses" may be defined on the -stack. For example, to - * write to a `FILE` object: - * -~~~{.c} -typedef struct { - PaddleMobile__Framework__ProtobufCBuffer base; - FILE *fp; -} BufferAppendToFile; - -static void -my_buffer_file_append(PaddleMobile__Framework__ProtobufCBuffer *buffer, - size_t len, - const uint8_t *data) -{ - BufferAppendToFile *file_buf = (BufferAppendToFile *) buffer; - fwrite(data, len, 1, file_buf->fp); // XXX: No error handling! -} -~~~ - * - * To use this new type of PaddleMobile__Framework__ProtobufCBuffer, it could be -called as follows: - * -~~~{.c} -... -BufferAppendToFile tmp = {0}; -tmp.base.append = my_buffer_file_append; -tmp.fp = fp; -protobuf_c_message_pack_to_buffer(&message, &tmp); -... -~~~ - */ -struct PaddleMobile__Framework__ProtobufCBuffer { - /** Append function. Consumes the `len` bytes stored at `data`. */ - void (*append)(PaddleMobile__Framework__ProtobufCBuffer *buffer, size_t len, - const uint8_t *data); -}; - -/** - * Simple buffer "subclass" of `PaddleMobile__Framework__ProtobufCBuffer`. - * - * A `PaddleMobile__Framework__ProtobufCBufferSimple` object is declared on the -stack and uses a - * scratch buffer provided by the user for the initial allocation. It performs - * exponential resizing, using dynamically allocated memory. 
A - * `PaddleMobile__Framework__ProtobufCBufferSimple` object can be created and -used as follows: - * -~~~{.c} -uint8_t pad[128]; -PaddleMobile__Framework__ProtobufCBufferSimple simple = -PROTOBUF_C_BUFFER_SIMPLE_INIT(pad); PaddleMobile__Framework__ProtobufCBuffer -*buffer = (PaddleMobile__Framework__ProtobufCBuffer *) &simple; -~~~ - * - * `buffer` can now be used with `protobuf_c_message_pack_to_buffer()`. Once a - * message has been serialized to a -`PaddleMobile__Framework__ProtobufCBufferSimple` object, the - * serialized data bytes can be accessed from the `.data` field. - * - * To free the memory allocated by a -`PaddleMobile__Framework__ProtobufCBufferSimple` object, if any, - * call PROTOBUF_C_BUFFER_SIMPLE_CLEAR() on the object, for example: - * -~~~{.c} -PROTOBUF_C_BUFFER_SIMPLE_CLEAR(&simple); -~~~ - * - * \see PROTOBUF_C_BUFFER_SIMPLE_INIT - * \see PROTOBUF_C_BUFFER_SIMPLE_CLEAR - */ -struct PaddleMobile__Framework__ProtobufCBufferSimple { - /** "Base class". */ - PaddleMobile__Framework__ProtobufCBuffer base; - /** Number of bytes allocated in `data`. */ - size_t alloced; - /** Number of bytes currently stored in `data`. */ - size_t len; - /** Data bytes. */ - uint8_t *data; - /** Whether `data` must be freed. */ - protobuf_c_boolean must_free_data; - /** Allocator to use. May be NULL to indicate the system allocator. */ - PaddleMobile__Framework__ProtobufCAllocator *allocator; -}; - -/** - * Describes an enumeration as a whole, with all of its values. - */ -struct PaddleMobile__Framework__ProtobufCEnumDescriptor { - /** Magic value checked to ensure that the API is used correctly. */ - uint32_t magic; - - /** The qualified name (e.g., "namespace.Type"). */ - const char *name; - /** The unqualified name as given in the .proto file (e.g., "Type"). */ - const char *short_name; - /** Identifier used in generated C code. */ - const char *c_name; - /** The dot-separated namespace. */ - const char *package_name; - - /** Number elements in `values`. 
*/ - unsigned n_values; - /** Array of distinct values, sorted by numeric value. */ - const PaddleMobile__Framework__ProtobufCEnumValue *values; - - /** Number of elements in `values_by_name`. */ - unsigned n_value_names; - /** Array of named values, including aliases, sorted by name. */ - const PaddleMobile__Framework__ProtobufCEnumValueIndex *values_by_name; - - /** Number of elements in `value_ranges`. */ - unsigned n_value_ranges; - /** Value ranges, for faster lookups by numeric value. */ - const PaddleMobile__Framework__ProtobufCIntRange *value_ranges; - - /** Reserved for future use. */ - void *reserved1; - /** Reserved for future use. */ - void *reserved2; - /** Reserved for future use. */ - void *reserved3; - /** Reserved for future use. */ - void *reserved4; -}; - -/** - * Represents a single value of an enumeration. - */ -struct PaddleMobile__Framework__ProtobufCEnumValue { - /** The string identifying this value in the .proto file. */ - const char *name; - - /** The string identifying this value in generated C code. */ - const char *c_name; - - /** The numeric value assigned in the .proto file. */ - int value; -}; - -/** - * Used by `PaddleMobile__Framework__ProtobufCEnumDescriptor` to look up enum - * values. - */ -struct PaddleMobile__Framework__ProtobufCEnumValueIndex { - /** Name of the enum value. */ - const char *name; - /** Index into values[] array. */ - unsigned index; -}; - -/** - * Describes a single field in a message. - */ -struct PaddleMobile__Framework__ProtobufCFieldDescriptor { - /** Name of the field as given in the .proto file. */ - const char *name; - - /** Tag value of the field as given in the .proto file. */ - uint32_t id; - - /** Whether the field is `REQUIRED`, `OPTIONAL`, or `REPEATED`. */ - PaddleMobile__Framework__ProtobufCLabel label; - - /** The type of the field. 
*/ - PaddleMobile__Framework__ProtobufCType type; - - /** - * The offset in bytes of the message's C structure's quantifier field - * (the `has_MEMBER` field for optional members or the `n_MEMBER` field - * for repeated members or the case enum for oneofs). - */ - unsigned quantifier_offset; - - /** - * The offset in bytes into the message's C structure for the member - * itself. - */ - unsigned offset; - - /** - * A type-specific descriptor. - * - * If `type` is `PROTOBUF_C_TYPE_ENUM`, then `descriptor` points to the - * corresponding `PaddleMobile__Framework__ProtobufCEnumDescriptor`. - * - * If `type` is `PROTOBUF_C_TYPE_MESSAGE`, then `descriptor` points to - * the corresponding `PaddleMobile__Framework__ProtobufCMessageDescriptor`. - * - * Otherwise this field is NULL. - */ - const void *descriptor; /* for MESSAGE and ENUM types */ - - /** The default value for this field, if defined. May be NULL. */ - const void *default_value; - - /** - * A flag word. Zero or more of the bits defined in the - * `PaddleMobile__Framework__ProtobufCFieldFlag` enum may be set. - */ - uint32_t flags; - - /** Reserved for future use. */ - unsigned reserved_flags; - /** Reserved for future use. */ - void *reserved2; - /** Reserved for future use. */ - void *reserved3; -}; - -/** - * Helper structure for optimizing int => index lookups in the case - * where the keys are mostly consecutive values, as they presumably are for - * enums and fields. - * - * The data structures requires that the values in the original array are - * sorted. - */ -struct PaddleMobile__Framework__ProtobufCIntRange { - int start_value; - unsigned orig_index; - /* - * NOTE: the number of values in the range can be inferred by looking - * at the next element's orig_index. A dummy element is added to make - * this simple. - */ -}; - -/** - * An instance of a message. - * - * `PaddleMobile__Framework__ProtobufCMessage` is a light-weight "base class" - * for all messages. 
- * - * In particular, `PaddleMobile__Framework__ProtobufCMessage` doesn't have any - * allocation policy associated with it. That's because it's common to create - * `PaddleMobile__Framework__ProtobufCMessage` objects on the stack. In fact, - * that's what we recommend for sending messages. If the object is allocated - * from the stack, you can't really have a memory leak. - * - * This means that calls to functions like - * PaddleMobile__Framework__protobuf_c_message_unpack() which return a - * `PaddleMobile__Framework__ProtobufCMessage` must be paired with a call to a - * free function, like - * PaddleMobile__Framework__protobuf_c_message_free_unpacked(). - */ -struct PaddleMobile__Framework__ProtobufCMessage { - /** The descriptor for this message type. */ - const PaddleMobile__Framework__ProtobufCMessageDescriptor *descriptor; - /** The number of elements in `unknown_fields`. */ - unsigned n_unknown_fields; - /** The fields that weren't recognized by the parser. */ - PaddleMobile__Framework__ProtobufCMessageUnknownField *unknown_fields; -}; - -/** - * Describes a message. - */ -struct PaddleMobile__Framework__ProtobufCMessageDescriptor { - /** Magic value checked to ensure that the API is used correctly. */ - uint32_t magic; - - /** The qualified name (e.g., "namespace.Type"). */ - const char *name; - /** The unqualified name as given in the .proto file (e.g., "Type"). */ - const char *short_name; - /** Identifier used in generated C code. */ - const char *c_name; - /** The dot-separated namespace. */ - const char *package_name; - - /** - * Size in bytes of the C structure representing an instance of this - * type of message. - */ - size_t sizeof_message; - - /** Number of elements in `fields`. */ - unsigned n_fields; - /** Field descriptors, sorted by tag number. */ - const PaddleMobile__Framework__ProtobufCFieldDescriptor *fields; - /** Used for looking up fields by name. 
*/ - const unsigned *fields_sorted_by_name; - - /** Number of elements in `field_ranges`. */ - unsigned n_field_ranges; - /** Used for looking up fields by id. */ - const PaddleMobile__Framework__ProtobufCIntRange *field_ranges; - - /** Message initialisation function. */ - ProtobufCMessageInit message_init; - - /** Reserved for future use. */ - void *reserved1; - /** Reserved for future use. */ - void *reserved2; - /** Reserved for future use. */ - void *reserved3; -}; - -/** - * An unknown message field. - */ -struct PaddleMobile__Framework__ProtobufCMessageUnknownField { - /** The tag number. */ - uint32_t tag; - /** The wire type of the field. */ - PaddleMobile__Framework__ProtobufCWireType wire_type; - /** Number of bytes in `data`. */ - size_t len; - /** Field data. */ - uint8_t *data; -}; - -/** - * Method descriptor. - */ -struct PaddleMobile__Framework__ProtobufCMethodDescriptor { - /** Method name. */ - const char *name; - /** Input message descriptor. */ - const PaddleMobile__Framework__ProtobufCMessageDescriptor *input; - /** Output message descriptor. */ - const PaddleMobile__Framework__ProtobufCMessageDescriptor *output; -}; - -/** - * Service. - */ -struct PaddleMobile__Framework__ProtobufCService { - /** Service descriptor. */ - const PaddleMobile__Framework__ProtobufCServiceDescriptor *descriptor; - /** Function to invoke the service. */ - void (*invoke)(PaddleMobile__Framework__ProtobufCService *service, - unsigned method_index, - const PaddleMobile__Framework__ProtobufCMessage *input, - ProtobufCClosure closure, void *closure_data); - /** Function to destroy the service. */ - void (*destroy)(PaddleMobile__Framework__ProtobufCService *service); -}; - -/** - * Service descriptor. - */ -struct PaddleMobile__Framework__ProtobufCServiceDescriptor { - /** Magic value checked to ensure that the API is used correctly. */ - uint32_t magic; - - /** Service name. */ - const char *name; - /** Short version of service name. 
*/ - const char *short_name; - /** C identifier for the service name. */ - const char *c_name; - /** Package name. */ - const char *package; - /** Number of elements in `methods`. */ - unsigned n_methods; - /** Method descriptors, in the order defined in the .proto file. */ - const PaddleMobile__Framework__ProtobufCMethodDescriptor *methods; - /** Sort index of methods. */ - const unsigned *method_indices_by_name; -}; - -/** - * Get the version of the protobuf-c library. Note that this is the version of - * the library linked against, not the version of the headers compiled against. - * - * \return A string containing the version number of protobuf-c. - */ -PROTOBUF_C__API -const char *PaddleMobile__Framework__protobuf_c_version(void); - -/** - * Get the version of the protobuf-c library. Note that this is the version of - * the library linked against, not the version of the headers compiled against. - * - * \return A 32 bit unsigned integer containing the version number of - * protobuf-c, represented in base-10 as (MAJOR*1E6) + (MINOR*1E3) + PATCH. - */ -PROTOBUF_C__API -uint32_t PaddleMobile__Framework__protobuf_c_version_number(void); - -/** - * The version of the protobuf-c headers, represented as a string using the same - * format as PaddleMobile__Framework__protobuf_c_version(). - */ -#define PROTOBUF_C_VERSION "1.3.0" - -/** - * The version of the protobuf-c headers, represented as an integer using the - * same format as PaddleMobile__Framework__protobuf_c_version_number(). - */ -#define PROTOBUF_C_VERSION_NUMBER 1003000 - -/** - * The minimum protoc-c version which works with the current version of the - * protobuf-c headers. - */ -#define PROTOBUF_C_MIN_COMPILER_VERSION 1000000 - -/** - * Determine the number of bytes required to store the serialised message. - * - * \param message - * The message object to serialise. - * \return - * Number of bytes. 
- */ -PROTOBUF_C__API -size_t PaddleMobile__Framework__protobuf_c_message_get_packed_size( - const PaddleMobile__Framework__ProtobufCMessage *message); - -/** - * Unpack a serialised message into an in-memory representation. - * - * \param descriptor - * The message descriptor. - * \param allocator - * `PaddleMobile__Framework__ProtobufCAllocator` to use for memory - * allocation. May be NULL to specify the default allocator. \param len Length - * in bytes of the serialised message. \param data Pointer to the - * serialised message. \return An unpacked message object. \retval NULL If - * an error occurred during unpacking. - */ -PROTOBUF_C__API -PaddleMobile__Framework__ProtobufCMessage * -PaddleMobile__Framework__protobuf_c_message_unpack( - const PaddleMobile__Framework__ProtobufCMessageDescriptor *descriptor, - PaddleMobile__Framework__ProtobufCAllocator *allocator, size_t len, - const uint8_t *data); - -/** - * Free an unpacked message object. - * - * This function should be used to deallocate the memory used by a call to - * PaddleMobile__Framework__protobuf_c_message_unpack(). - * - * \param message - * The message object to free. May be NULL. - * \param allocator - * `PaddleMobile__Framework__ProtobufCAllocator` to use for memory - * deallocation. May be NULL to specify the default allocator. - */ -PROTOBUF_C__API -void PaddleMobile__Framework__protobuf_c_message_free_unpacked( - PaddleMobile__Framework__ProtobufCMessage *message, - PaddleMobile__Framework__ProtobufCAllocator *allocator); - -/** - * Check the validity of a message object. - * - * Makes sure all required fields (`PROTOBUF_C_LABEL_REQUIRED`) are present. - * Recursively checks nested messages. - * - * \retval TRUE - * Message is valid. - * \retval FALSE - * Message is invalid. - */ -PROTOBUF_C__API -protobuf_c_boolean PaddleMobile__Framework__protobuf_c_message_check( - const PaddleMobile__Framework__ProtobufCMessage *); - -/** Message initialiser. 
*/ -#define PROTOBUF_C_MESSAGE_INIT(descriptor) \ - { descriptor, 0, NULL } - -/** - * Initialise a message object from a message descriptor. - * - * \param descriptor - * Message descriptor. - * \param message - * Allocated block of memory of size `descriptor->sizeof_message`. - */ -PROTOBUF_C__API -void PaddleMobile__Framework__protobuf_c_message_init( - const PaddleMobile__Framework__ProtobufCMessageDescriptor *descriptor, - void *message); - -/** - * Initialise a `PaddleMobile__Framework__ProtobufCBufferSimple` object. - */ -#define PROTOBUF_C_BUFFER_SIMPLE_INIT(array_of_bytes) \ - { \ - {PaddleMobile__Framework__protobuf_c_buffer_simple_append}, \ - sizeof(array_of_bytes), 0, (array_of_bytes), 0, NULL \ - } - -/** - * Clear a `PaddleMobile__Framework__ProtobufCBufferSimple` object, freeing any - * allocated memory. - */ -#define PROTOBUF_C_BUFFER_SIMPLE_CLEAR(simp_buf) \ - do { \ - if ((simp_buf)->must_free_data) { \ - if ((simp_buf)->allocator != NULL) \ - (simp_buf)->allocator->free((simp_buf)->allocator, (simp_buf)->data); \ - else \ - free((simp_buf)->data); \ - } \ - } while (0) - -/** - * The `append` method for `PaddleMobile__Framework__ProtobufCBufferSimple`. - * - * \param buffer - * The buffer object to append to. Must actually be a - * `PaddleMobile__Framework__ProtobufCBufferSimple` object. - * \param len - * Number of bytes in `data`. - * \param data - * Data to append. 
- */ -PROTOBUF_C__API -void PaddleMobile__Framework__protobuf_c_buffer_simple_append( - PaddleMobile__Framework__ProtobufCBuffer *buffer, size_t len, - const unsigned char *data); - -/**@}*/ - -PROTOBUF_C__END_DECLS - -#endif /* PROTOBUF_C_H */ diff --git a/mobile/test/CMakeLists.txt b/mobile/test/CMakeLists.txt deleted file mode 100644 index 9fbf33da90..0000000000 --- a/mobile/test/CMakeLists.txt +++ /dev/null @@ -1,578 +0,0 @@ -set(dir ${CMAKE_CURRENT_SOURCE_DIR}) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build") -set(FOUND_MATCH OFF) -set(ENABLE_ALL_TEST ON) - -if (ANDROID_ABI STREQUAL "arm64-v8a") - message("using google's linker to link armv8 binary") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold") -endif () - -set(CON -1) - -message(STATUS "nets :${NET}") - -list(FIND NET "net" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-net paddle-mobile) - set(FOUND_MATCH ON) -endif () - -list(FIND NET "googlenet" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-googlenet paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-googlenet-quali paddle-mobile) - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "mobilenet" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-mobilenet paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-mobilenet-combine paddle-mobile) - set(FOUND_MATCH ON) - - # gen test - ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp 
test_helper.h test_include.h) - target_link_libraries(test-mobilenetgpu paddle-mobile) - -endif () - -list(FIND NET "yolo" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-yolo paddle-mobile) - # gen test - ADD_EXECUTABLE(test-yolo-combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-yolo-combined paddle-mobile) - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "squeezenet" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-squeezenet paddle-mobile) - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "resnet" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-resnet paddle-mobile) - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "FPGA_NET_V1" CON) -if (CON GREATER -1) - #ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) - #target_link_libraries(test-resnet50 paddle-mobile) - - #ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) - #target_link_libraries(test-densebox paddle-mobile) - - #ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h) - #target_link_libraries(test-rfcn paddle-mobile) - - #ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h) - #target_link_libraries(test-marker paddle-mobile) - - ADD_EXECUTABLE(test-rfcn-api fpga/test_rfcn_api.cpp) - target_link_libraries(test-rfcn-api paddle-mobile) - - ADD_EXECUTABLE(test-mobilenet-api fpga/test_mobilenet_api.cpp) - target_link_libraries(test-mobilenet-api paddle-mobile) - - ADD_EXECUTABLE(test-yolo-api 
fpga/test_yolo_api.cpp) - target_link_libraries(test-yolo-api paddle-mobile) - - ADD_EXECUTABLE(test-marker-api fpga/test_marker_api.cpp) - target_link_libraries(test-marker-api paddle-mobile) - - #ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h ) - #target_link_libraries(test-marker2 paddle-mobile) - - #ADD_EXECUTABLE(test-mobilenet fpga/test_mobilenet_beijing.cpp test_helper.h test_include.h executor_for_test.h) - #target_link_libraries(test-mobilenet paddle-mobile) - - #ADD_EXECUTABLE(test-yolo fpga/test_yolo_combine.cpp test_helper.h test_include.h executor_for_test.h) - #target_link_libraries(test-yolo paddle-mobile) - - set(FOUND_MATCH ON) -endif () - -list(FIND NET "FPGA_NET_V2" CON) -if (CON GREATER -1) - ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-resnet50 paddle-mobile) - - ADD_EXECUTABLE(test-pe fpga/test_pe.cpp) - target_link_libraries(test-pe paddle-mobile) - - ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-densebox paddle-mobile) - - set(FOUND_MATCH ON) -endif () - -list(FIND NET "FPGA_OPS_KD" CON) -if (CON GREATER -1) - ADD_EXECUTABLE(test-ssd fpga/test_ssd.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-ssd paddle-mobile) - - set(FOUND_MATCH ON) -endif () - -list(FIND NET "mobilenetssd" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-mobilenetssd paddle-mobile) - - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "nlp" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-nlp paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-gru-op 
operators/test_gru_op.cpp test_helper.h test_include.h) - target_link_libraries(test-gru-op paddle-mobile) - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "mobilenetfssd" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) - target_link_libraries(test-fssd paddle-mobile) - - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "genet" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-genet paddle-mobile) - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "super" CON) -if (CON GREATER -1) - # gen test - ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-super paddle-mobile) - set(FOUND_MATCH ON) - -endif () - -list(FIND NET "op" CON) -if (CON GREATER -1) - # # gen test - # ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) - # target_link_libraries(test-sigmoid paddle-mobile) - # - # # gen test log - # ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp) - # target_link_libraries(test-leakyrelu paddle-mobile) - # gen test log - ADD_EXECUTABLE(test-log common/test_log.cpp) - target_link_libraries(test-log paddle-mobile) - set(FOUND_MATCH ON) -endif () - -if (ENABLE_ALL_TEST) - if (NOT FOUND_MATCH) - # gen test - ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-resnet paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-squeezenet paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-yolo paddle-mobile) - - # gen test - ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp 
test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test_yolo_combined paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-op-in-net net/test_op_in_net.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-op-in-net paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-googlenet paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-googlenet-quali paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-conv-op operators/test_conv_op.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-conv-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-expend-op operators/test_expend_op.cpp test_helper.h test_include.h executor_for_test_opencl.h) - target_link_libraries(test-expend-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h) - target_link_libraries(test-mul-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h) - target_link_libraries(test-elementwiseadd-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h) - target_link_libraries(test-elementwisesub-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h) - target_link_libraries(test-im2sequence-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h) - target_link_libraries(test-concat-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h) - 
target_link_libraries(test-lrn-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h) - target_link_libraries(test-batchnorm-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h) - target_link_libraries(test-priorbox-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h) - target_link_libraries(test-boxcoder-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) - target_link_libraries(test-transpose-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h) - target_link_libraries(test-transpose2-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) - target_link_libraries(test-multiclassnms-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h) - target_link_libraries(test-polygon-box-transform-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-fill-constant-op operators/test_fill_constant_op.cpp test_helper.h test_include.h) - target_link_libraries(test-fill-constant-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) - target_link_libraries(test-reshape-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h) - target_link_libraries(test-reshape2-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) - target_link_libraries(test-relu-op paddle-mobile) - - ADD_EXECUTABLE(test-relu6-op 
operators/test_relu6_op.cpp test_helper.h test_include.h) - target_link_libraries(test-relu6-op paddle-mobile) - - ADD_EXECUTABLE(test-tanh-op operators/test_tanh_op.cpp test_helper.h test_include.h) - target_link_libraries(test-tanh-op paddle-mobile) - - ADD_EXECUTABLE(test-log-op operators/test_log_op.cpp test_helper.h test_include.h) - target_link_libraries(test-log-op paddle-mobile) - - ADD_EXECUTABLE(test-topk-op operators/test_topk_op.cpp test_helper.h test_include.h) - target_link_libraries(test-topk-op paddle-mobile) - - ADD_EXECUTABLE(test-cast-op operators/test_cast_op.cpp test_helper.h test_include.h) - target_link_libraries(test-cast-op paddle-mobile) - - ADD_EXECUTABLE(test-less-than-op operators/test_less_than_op.cpp test_helper.h test_include.h) - target_link_libraries(test-less-than-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) - target_link_libraries(test-fc-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h) - target_link_libraries(test-sum-op paddle-mobile) - - # test quantize op - ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h) - target_link_libraries(test-quantize-op paddle-mobile) - - # test dequantize op - ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h) - target_link_libraries(test-dequantize-op paddle-mobile) - - # gen test log - ADD_EXECUTABLE(test-log common/test_log.cpp) - target_link_libraries(test-log paddle-mobile) - - # gen test log - ADD_EXECUTABLE(test-load framework/test_load.cpp) - target_link_libraries(test-load paddle-mobile) - - # gen test log - ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp) - target_link_libraries(test-loadmemory paddle-mobile) - - # gen test log - ADD_EXECUTABLE(test-loadmemory-inference framework/test_load_memory_inference_api.cpp) - 
target_link_libraries(test-loadmemory-inference paddle-mobile) - - ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp) - target_link_libraries(test-inference-api paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) - target_link_libraries(test-optimize paddle-mobile) - - #gen test - ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-pool-op paddle-mobile) - - #gen test - ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-softmax-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) - target_link_libraries(test-gemm-accuracy paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp) - target_link_libraries(test-gemm-int8-accuracy paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) - target_link_libraries(test-gemm-perf paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) - target_link_libraries(test-enforce paddle-mobile) - - # gen test - test if openmp works - ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-openmp paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-mobilenetssd paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-mobilenet-combine paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-genet paddle-mobile) - - # gen test - 
ADD_EXECUTABLE(test-sigmoid-op operators/test_sigmoid_op.cpp test_include.h) - target_link_libraries(test-sigmoid-op paddle-mobile) - - # gen test log - ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp) - target_link_libraries(test-leakyrelu paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-depthwise-conv-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-mobilenet paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-conv-add-relu-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-nlp paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h) - target_link_libraries(test-gru-op paddle-mobile) - - # gen test - - ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-inceptionv4 paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-alexnet paddle-mobile) - - ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h) - target_link_libraries(test-googlenetv1 paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) - 
target_link_libraries(test-fssd paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) - target_link_libraries(test-mobilenetgpu paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-yologpu paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h) - target_link_libraries(test-multi-process paddle-mobile) - - # gen test benchmark - ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp) - target_link_libraries(test-benchmark paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) - target_link_libraries(test-eng paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h) - target_link_libraries(test-super paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h) - target_link_libraries(test-ocr paddle-mobile) - - ADD_EXECUTABLE(test-gesture net/test_gesture.cpp test_helper.h test_include.h) - target_link_libraries(test-gesture paddle-mobile) - - ADD_EXECUTABLE(test-sequence-expand-op operators/test_sequence_expand_op.cpp test_helper.h test_include.h) - target_link_libraries(test-sequence-expand-op paddle-mobile) - - ADD_EXECUTABLE(test-sequence-pool-op operators/test_sequence_pool_op.cpp test_helper.h test_include.h) - target_link_libraries(test-sequence-pool-op paddle-mobile) - - ADD_EXECUTABLE(test-sequence-softmax-op operators/test_sequence_softmax_op.cpp test_helper.h test_include.h) - target_link_libraries(test-sequence-softmax-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-vgg16ssd net/test_vgg16ssd.cpp test_helper.h test_include.h) - target_link_libraries(test-vgg16ssd paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-logical-and-op operators/test_logical_and_op.cpp 
test_helper.h test_include.h) - target_link_libraries(test-logical-and-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-logical-or-op operators/test_logical_or_op.cpp test_helper.h test_include.h) - target_link_libraries(test-logical-or-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-logical-not-op operators/test_logical_not_op.cpp test_helper.h test_include.h) - target_link_libraries(test-logical-not-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-logical-xor-op operators/test_logical_xor_op.cpp test_helper.h test_include.h) - target_link_libraries(test-logical-xor-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-increment-op operators/test_increment_op.cpp test_helper.h test_include.h) - target_link_libraries(test-increment-op paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-is-empty-op operators/test_is_empty_op.cpp test_helper.h test_include.h) - target_link_libraries(test-is-empty-op paddle-mobile) - - ADD_EXECUTABLE(test-conv-bn-relu-op operators/test_conv_bn_relu_op.cpp test_helper.h test_include.h) - target_link_libraries(test-conv-bn-relu-op paddle-mobile) - - ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h) - target_link_libraries(test-dwconv-bn-relu-op paddle-mobile) - - ADD_EXECUTABLE(test-conv-gpu operators/test_conv_gpu.cpp test_helper.h test_include.h) - target_link_libraries(test-conv-gpu paddle-mobile) - - ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h) - target_link_libraries(test-net-benchmark paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-net paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-net-feeds net/test_net_multi_feed.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-net-feeds paddle-mobile) - - # gen test - ADD_EXECUTABLE(test-net-performance net/test_net_performance.cpp 
test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-net-performance paddle-mobile) - - ADD_EXECUTABLE(test-infer-imfix net/test_inference_imfix.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-infer-imfix paddle-mobile) - -# ADD_EXECUTABLE(test-inference-ercy net/test_inference_ercy.cpp test_helper.h test_include.h executor_for_test.h) -# target_link_libraries(test-inference-api-v2 paddle-mobile) - - if (GPU_CL) - ADD_EXECUTABLE(test-net-male2fe net/test_mobilenet_male2fe.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-net-male2fe paddle-mobile) - - ADD_EXECUTABLE(test-infer-m2fm net/test_inference_m2fm.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-infer-m2fm paddle-mobile) - - endif() - - endif () -else () - # gen test - ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-net paddle-mobile) - - ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h) - target_link_libraries(test-net-benchmark paddle-mobile) - -# ADD_EXECUTABLE(test-inference-ercy net/test_inference_ercy.cpp test_helper.h test_include.h executor_for_test.h) -# target_link_libraries(test-inference-api-v2 paddle-mobile) -endif () diff --git a/mobile/test/common/test_enforce.cpp b/mobile/test/common/test_enforce.cpp deleted file mode 100644 index 9bb499315d..0000000000 --- a/mobile/test/common/test_enforce.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "common/enforce.h" - -int main() { - PADDLE_MOBILE_ENFORCE(false, "enforce"); - PADDLE_MOBILE_THROW_EXCEPTION("throw a exception"); - return 0; -} diff --git a/mobile/test/common/test_gemm_accuracy.cpp b/mobile/test/common/test_gemm_accuracy.cpp deleted file mode 100644 index fc1041bde0..0000000000 --- a/mobile/test/common/test_gemm_accuracy.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "../test_helper.h" -#include "common/log.h" -#include "memory/t_malloc.h" -#include "operators/math/gemm/cblas.h" - -#define a(i, j) a[(i)*lda + (j)] -#define b(i, j) b[(i)*ldb + (j)] -#define c(i, j) c[(i)*ldc + (j)] -#define c1(i, j) c1[(i)*ldc + (j)] - -void print_matrix(int m, int n, int ldc, float *c) { - for (int i = 0; i < m; ++i) { - std::cout << c(i, 0); - for (int j = 1; j < n; ++j) { - std::cout << " | " << c(i, j); - } - std::cout << std::endl; - } - std::cout << std::endl; -} - -int do_sgemm(int m, int n, int k, int pr) { - const float alpha = 1.f; - const float beta = 0.f; - const int lda = k; - const int ldb = n; - const int ldc = n; - - float *a = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * k)); - float *b = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * k * n)); - float *c = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); - float *c1 = - static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); - - std::mt19937 rng(111); - std::uniform_real_distribution uniform_dist(0, 1); - const float lower = -10.f; - const float upper = 10.f; - - for (int i = 0; i < m * k; ++i) { - a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } - for (int i = 0; i < k * n; ++i) { - b[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } - memcpy(c, c1, sizeof(float) * m * n); - - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - float r = 0; - for (int p = 0; p < k; p++) { - r += a(i, p) * b(p, j); - } - c1(i, j) = alpha * r; - } - } - - std::cout << "run cblas_sgemm..." << std::endl; - paddle_mobile::operators::math::cblas_sgemm(false, false, m, n, k, alpha, a, - lda, b, ldb, 0.f, c, ldc); - - std::cout << "compare results..." 
<< std::endl; - for (int i = 0; i < m * n; ++i) { - if (abs(c[i] - c1[i]) >= 1e-2) { - std::cout << "c[" << i << "] != c1[" << i << "]: " << c[i] << " vs " - << c1[i] << std::endl; - exit(1); - } - } - - if (pr > 0) { - std::cout << "A:" << std::endl; - print_matrix(m, k, lda, a); - std::cout << "B:" << std::endl; - print_matrix(k, n, ldb, b); - std::cout << "C:" << std::endl; - print_matrix(m, n, ldc, c); - std::cout << "C1:" << std::endl; - print_matrix(m, n, ldc, c1); - } - - paddle_mobile::memory::Free(a); - paddle_mobile::memory::Free(b); - paddle_mobile::memory::Free(c); - paddle_mobile::memory::Free(c1); - - return 0; -} - -int main(int argc, char *argv[]) { - do_sgemm(1, 1, 1, 1); - - do_sgemm(9, 9, 1, 1); - do_sgemm(999, 99, 1, 0); - do_sgemm(999, 1, 1, 0); - do_sgemm(1, 9, 9, 1); - do_sgemm(1, 99, 999, 0); - do_sgemm(1, 1, 999, 0); - - do_sgemm(9, 9, 9, 1); - do_sgemm(10, 6, 12, 1); - do_sgemm(512, 256, 384, 0); - do_sgemm(1366, 768, 256, 0); - do_sgemm(1255, 755, 333, 0); - do_sgemm(555, 777, 999, 0); - - do_sgemm(10, 6, 12, 1); - do_sgemm(512, 256, 384, 0); - do_sgemm(1366, 768, 256, 0); - do_sgemm(1255, 755, 333, 0); - do_sgemm(555, 777, 999, 0); - - return 0; -} diff --git a/mobile/test/common/test_gemm_int8_accuracy.cpp b/mobile/test/common/test_gemm_int8_accuracy.cpp deleted file mode 100644 index 7d20a178c1..0000000000 --- a/mobile/test/common/test_gemm_int8_accuracy.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include "../test_helper.h" -#include "common/log.h" -#include "memory/t_malloc.h" -#include "operators/math/gemm.h" -#ifdef _OPENMP -#include -#endif // _OPENMP - -#define a(i, j) a[(i)*lda + (j)] -#define b(i, j) b[(i)*ldb + (j)] -#define c(i, j) c[(i)*ldc + (j)] -#define c1(i, j) c1[(i)*ldc + (j)] - -using std::default_random_engine; -using std::uniform_int_distribution; - -template -void print_matrix(int m, int n, int ldc, T *c) { - for (int i = 0; i < m; ++i) { - if (std::is_same::value) { - std::cout.setf(std::ios::left); - std::cout.width(4); - std::cout << static_cast(c(i, 0)); - } else { - std::cout.setf(std::ios::left); - std::cout.width(6); - std::cout << c(i, 0); - } - for (int j = 1; j < n; ++j) { - if (std::is_same::value) { - std::cout << " | "; - std::cout.setf(std::ios::left); - std::cout.width(4); - std::cout << static_cast(c(i, j)); - } else { - std::cout << " | "; - std::cout.setf(std::ios::left); - std::cout.width(6); - std::cout << c(i, j); - } - } - std::cout << "\n"; - } - std::cout << std::endl; -} - -int32_t qadd_int32(int32_t l, int32_t r) { - int64_t res = static_cast(l) + static_cast(r); - if (res > std::numeric_limits::max()) - return std::numeric_limits::max(); - else if (res < std::numeric_limits::min()) - return std::numeric_limits::min(); - else - return static_cast(res); -} - -// round to zero -float round2zero(float v) { - float res; - if (v > 0) - res = std::floor(v); - else if (v < 0) - res = std::ceil(v); - return res; -} - -int8_t qscale_int32(int32_t v, float scale) { - float res = static_cast(v) * scale; - res = round2zero(res); - if (res > 127) - return static_cast(127); - else if (res < -127) - return static_cast(-127); - else - return static_cast(res); -} - -int do_sgemm(int m, int n, int k, bool relu, int pr) { - int lda = k; - int ldb = n; - int ldc = n; 
- default_random_engine e; - uniform_int_distribution pixel(-127, 127); - int8_t *a = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k)); - int8_t *b = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n)); - int32_t *c = static_cast( - paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n)); - int32_t *c1 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n)); - - for (int i = 0; i < m * k; ++i) { - a[i] = pixel(e); - } - for (int i = 0; i < k * n; ++i) { - b[i] = pixel(e); - } - - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - int32_t r = 0; - for (int p = 0; p < k; p++) { - r += static_cast(a(i, p)) * static_cast(b(p, j)); - } - c1(i, j) = r; - } - } - - paddle_mobile::operators::math::Gemm gemm; -#ifdef _OPENMP - gemm.Sgemm_omp(m, n, k, static_cast(1), a, lda, b, ldb, - static_cast(0), c, ldc, relu, nullptr); -#else - gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, - static_cast(0), c, ldc, relu, nullptr); -#endif - int eq = 0; - int neq = 0; - for (int i = 0; i < m * n; ++i) { - if (c[i] == c1[i]) { - ++eq; - } else { - ++neq; - } - } - - if (pr > 0) { - std::cout << "A:" << std::endl; - print_matrix(m, k, lda, a); - std::cout << "B:" << std::endl; - print_matrix(k, n, ldb, b); - std::cout << "C:" << std::endl; - print_matrix(m, n, ldc, c); - std::cout << "C1:" << std::endl; - print_matrix(m, n, ldc, c1); - } - - std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu - << " eq=" << eq << " neq=" << neq << std::endl; - - PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm is failed!"); - - paddle_mobile::memory::Free(a); - paddle_mobile::memory::Free(b); - paddle_mobile::memory::Free(c); - paddle_mobile::memory::Free(c1); - - return 0; -} - -int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr, - bool addOnRow = false) { - int lda = k; - int ldb = n; - int ldc = n; - float scale = 1; - default_random_engine e; - uniform_int_distribution pixel(-127, 127); - int8_t 
*a = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k)); - int8_t *b = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n)); - int8_t *c = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n)); - int8_t *c1 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n)); - - int32_t *bias = nullptr; - if (addOnRow) { - bias = static_cast( - paddle_mobile::memory::Alloc(sizeof(int32_t) * n)); - } else { - bias = static_cast( - paddle_mobile::memory::Alloc(sizeof(int32_t) * m)); - } - - for (int i = 0; i < m * k; ++i) { - a[i] = pixel(e); - } - for (int i = 0; i < k * n; ++i) { - b[i] = pixel(e); - } - - if (addOnRow) { - for (int i = 0; i < n; ++i) { - bias[i] = static_cast(pixel(e)); - } - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - int32_t bias_v = bias[j]; - int32_t r = 0; - for (int p = 0; p < k; p++) { - r += static_cast(a(i, p)) * static_cast(b(p, j)); - } - r = qadd_int32(r, bias_v); - if (relu) r = std::max(0, r); - c1(i, j) = qscale_int32(r, scale); - } - } - } else { - for (int i = 0; i < m; ++i) { - bias[i] = static_cast(pixel(e)); - } - for (int i = 0; i < m; ++i) { - int32_t bias_v = bias[i]; - for (int j = 0; j < n; ++j) { - int32_t r = 0; - for (int p = 0; p < k; p++) { - r += static_cast(a(i, p)) * static_cast(b(p, j)); - } - r = qadd_int32(r, bias_v); - if (relu) r = std::max(0, r); - c1(i, j) = qscale_int32(r, scale); - } - } - } - - paddle_mobile::operators::math::Gemm gemm; -#ifdef _OPENMP - gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast(0), c, ldc, - relu, bias, addOnRow); -#else - gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast(0), c, ldc, - relu, bias, addOnRow); -#endif - int eq = 0; - int neq = 0; - for (int i = 0; i < m * n; ++i) { - if (c[i] == c1[i]) { - ++eq; - } else { - ++neq; - } - } - - if (pr > 0) { - std::cout << "A:" << std::endl; - print_matrix(m, k, lda, a); - std::cout << "B:" << std::endl; - print_matrix(k, n, ldb, b); - std::cout << 
"Bias:" << std::endl; - if (addOnRow) { - print_matrix(1, n, n, bias); - } else { - print_matrix(m, 1, 1, bias); - } - std::cout << "C:" << std::endl; - print_matrix(m, n, ldc, c); - std::cout << "C1:" << std::endl; - print_matrix(m, n, ldc, c1); - } - - std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu - << " eq=" << eq << " neq=" << neq << std::endl; - - PADDLE_MOBILE_ENFORCE(neq == 0, - "The execution of do_sgemm_with_bias is failed!"); - - paddle_mobile::memory::Free(a); - paddle_mobile::memory::Free(b); - paddle_mobile::memory::Free(c); - paddle_mobile::memory::Free(c1); - paddle_mobile::memory::Free(bias); - - return 0; -} - -int main() { -#ifdef _OPENMP - omp_set_num_threads(4); -#endif - std::cout << "\n\n******************************************************\n\n" - << std::endl; - std::cout << "Test gemm without bias:" << std::endl; - do_sgemm(9, 9, 9, false, 1); - do_sgemm(10, 6, 12, false, 0); - do_sgemm(512, 256, 384, false, 0); - do_sgemm(1366, 768, 256, false, 0); - do_sgemm(1255, 755, 333, false, 0); - do_sgemm(599, 1133, 393, false, 0); - do_sgemm(777, 555, 999, false, 0); - do_sgemm(333, 797, 939, false, 0); - do_sgemm(1024, 1024, 1024, false, 0); - - std::cout << "\n\n******************************************************\n\n" - << std::endl; - std::cout << "Test gemm with bias(bias is added on column):" << std::endl; - do_sgemm_with_bias(9, 9, 9, false, 1); - do_sgemm_with_bias(10, 6, 12, false, 0); - do_sgemm_with_bias(512, 256, 384, false, 0); - do_sgemm_with_bias(1366, 768, 256, false, 0); - do_sgemm_with_bias(1255, 755, 333, false, 0); - do_sgemm_with_bias(599, 1133, 393, false, 0); - do_sgemm_with_bias(777, 555, 999, false, 0); - do_sgemm_with_bias(333, 797, 939, false, 0); - do_sgemm_with_bias(1024, 1024, 1024, false, 0); - - std::cout << "\n\n******************************************************\n\n" - << std::endl; - std::cout << "Test gemm with bias(bias is added on row):" << std::endl; - do_sgemm_with_bias(9, 9, 9, 
false, 1, true); - do_sgemm_with_bias(10, 6, 12, false, 0, true); - do_sgemm_with_bias(512, 256, 384, false, 0, true); - do_sgemm_with_bias(1366, 768, 256, false, 0, true); - do_sgemm_with_bias(1255, 755, 333, false, 0, true); - do_sgemm_with_bias(599, 1133, 393, false, 0, true); - do_sgemm_with_bias(777, 555, 999, false, 0, true); - do_sgemm_with_bias(333, 797, 939, false, 0, true); - do_sgemm_with_bias(1024, 1024, 1024, false, 0, true); - - std::cout << "\n\n******************************************************\n\n" - << std::endl; - std::cout << "Test gemm with relu and bias:" << std::endl; - do_sgemm_with_bias(9, 9, 9, true, 1); - do_sgemm_with_bias(10, 6, 12, true, 0); - do_sgemm_with_bias(512, 256, 384, true, 0); - do_sgemm_with_bias(1366, 768, 256, true, 0); - do_sgemm_with_bias(1255, 755, 333, true, 0); - do_sgemm_with_bias(599, 1133, 393, true, 0); - do_sgemm_with_bias(777, 555, 999, true, 0); - do_sgemm_with_bias(333, 797, 939, true, 0); - do_sgemm_with_bias(1024, 1024, 1024, true, 0); - - return 0; -} diff --git a/mobile/test/common/test_gemm_perf.cpp b/mobile/test/common/test_gemm_perf.cpp deleted file mode 100644 index c88a65625d..0000000000 --- a/mobile/test/common/test_gemm_perf.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/math/gemm.h" -#include "operators/math/math_function.h" - -#define a(i, j) a[(i)*lda + (j)] -#define b(i, j) b[(i)*ldb + (j)] -#define c1(i, j) c1[(i)*ldc + (j)] - -#define m 1024 -#define n 1024 -#define k 1024 - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - Tensor aa, bb, cc; - auto aaptr = aa.mutable_data({m, k}); - auto bbptr = bb.mutable_data({k, n}); - auto ccptr = cc.mutable_data({m, n}); - - for (int i = 0; i < m * k; ++i) { - aaptr[i] = 2; - } - for (int i = 0; i < k * n; ++i) { - bbptr[i] = 2; - } - for (int i = 0; i < m * n; ++i) { - ccptr[i] = 2; - } - - Tensor aa_int8, bb_int8, cc_int32, cc_int8; - auto aaptr_int8 = aa_int8.mutable_data({m, k}); - auto bbptr_int8 = bb_int8.mutable_data({k, n}); - auto ccptr_int32 = cc_int32.mutable_data({m, n}); - auto ccptr_int8 = cc_int8.mutable_data({m, n}); - int32_t* bias_data_col = new int32_t[m]; - int32_t* bias_data_row = new int32_t[n]; - - for (int i = 0; i < m * k; ++i) { - aaptr_int8[i] = static_cast(2); - } - for (int i = 0; i < k * n; ++i) { - bbptr_int8[i] = static_cast(2); - } - for (int i = 0; i < m * n; ++i) { - ccptr_int32[i] = static_cast(2); - } - - for (int i = 0; i < m; ++i) { - bias_data_col[i] = 2; - } - - for (int i = 0; i < n; ++i) { - bias_data_row[i] = 2; - } - - // float - // warm-up 10 times - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa, false, bb, false, static_cast(1), &cc, static_cast(0), - false, nullptr); - } - - auto time_start0 = time(); - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa, false, bb, false, static_cast(1), &cc, static_cast(0), - false, nullptr); - } - auto time_end0 = time(); - std::cout << "float gemm cost :" << time_diff(time_start0, time_end0) / 10 - << "ms\n"; - - // int8_t without bias - // warm-up 10 times - for (int j = 0; j < 10; ++j) { - 
paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(1), &cc_int32, - static_cast(0)); - } - - auto time_start1 = time(); - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(1), &cc_int32, - static_cast(0)); - } - auto time_end1 = time(); - std::cout << "int8_t gemm cost :" << time_diff(time_start1, time_end1) / 10 - << "ms\n"; - - // int8_t with bias, column element wise add - // warm-up 10 times - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), false, bias_data_col, false); - } - auto time_start2 = time(); - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), false, bias_data_col, false); - } - auto time_end2 = time(); - std::cout << "int8_t gemm_with_bias(column add) cost :" - << time_diff(time_start2, time_end2) / 10 << "ms\n"; - - // int8_t with bias, row element wise add - // warm-up 10 times - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), false, bias_data_row, true); - } - auto time_start3 = time(); - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), false, bias_data_row, true); - } - auto time_end3 = time(); - std::cout << "int8_t gemm_with_bias(row add) cost :" - << time_diff(time_start3, time_end3) / 10 << "ms\n"; - - // int8_t with bias&relu - // warm-up 10 times - for (int j = 0; j < 10; ++j) { - paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), true, bias_data_col, false); - } - auto time_start4 = time(); - for (int j = 0; j < 10; ++j) { - 
paddle_mobile::operators::math::MatMul( - aa_int8, false, bb_int8, false, static_cast(0.618), &cc_int8, - static_cast(0), true, bias_data_col, false); - } - auto time_end4 = time(); - std::cout << "int8_t gemm_with_bias_relu cost :" - << time_diff(time_start4, time_end4) / 10 << "ms\n"; - - delete[] bias_data_row; - delete[] bias_data_col; - - return 0; -} diff --git a/mobile/test/common/test_lib_size.cpp b/mobile/test/common/test_lib_size.cpp deleted file mode 100644 index 805668f359..0000000000 --- a/mobile/test/common/test_lib_size.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// -// Created by liuRuiLong on 2018/6/6. -// - -#include "test_lib_size.h" - -static test_lib_size t; diff --git a/mobile/test/common/test_lib_size.h b/mobile/test/common/test_lib_size.h deleted file mode 100644 index a00a5afe12..0000000000 --- a/mobile/test/common/test_lib_size.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -// -// Created by liuRuiLong on 2018/6/6. -// - -#ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H -#define PADDLE_MOBILE_TEST_LIB_SIZE_H - -#include -#include -#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include - -//#include -//#include -//#include -//#include -//#include - -void foo() { - // char *str = "1234"; - // char dst[10]; - // strcpy(dst, str); - - // std::cout << "12345" << std::endl; - std::vector vec = {1, 2, 3, 4, 5}; - vec.push_back(2); - - pthread_mutex_init(NULL, NULL); - pthread_attr_destroy(NULL); - // std::find(vec.begin(), vec.end(), 1); - - // std::list l; - // std::mutex mutex_; - - // std::map m; - // std::unordered_map u_m; - // std::unordered_set u_s; - // std::string ss = "12345"; - // printf("%f", ss.c_str()); - - // std::initializer_list init_list = {1, 2}; - // std::tuple t = {1, 2}; - - // std::tuple_element>::type - - // std::tuple<> - - // int i; - // int j; - // if (typeid(i) == typeid(j)){ - // int z = 10; - // } - - // std::shared_ptr s1 = std::make_shared(); - - // std::stringstream ss; - // ss << "12345"; -} - -class test_lib_size { - public: - test_lib_size() {} - // std::shared_ptr Test(){ - // std::vector vec = {1, 2, 3}; - // std::shared_ptr si = std::make_shared(); - // return si; - // } - - // void test(){ - // int i = 9; - // } -}; - -#endif // PADDLE_MOBILE_TEST_LIB_SIZE_H diff --git a/mobile/test/common/test_log.cpp b/mobile/test/common/test_log.cpp deleted file mode 100644 index 7ba964d18b..0000000000 --- a/mobile/test/common/test_log.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "common/log.h" - -int main() { - LOG(paddle_mobile::kLOG_DEBUG3) << "test debug" - << " next log"; - LOG(paddle_mobile::kLOG_DEBUG) << "test debug" - << " next log"; - - LOG(paddle_mobile::kLOG_DEBUG1) << "test debug1" - << " next log"; - LOG(paddle_mobile::kLOG_DEBUG2) << "test debug2" - << " next log"; - LOG(paddle_mobile::kLOG_INFO) << "INFO!!!"; - LOG(paddle_mobile::kLOG_WARNING) << "WARNING!!!"; - LOG(paddle_mobile::kLOG_VERBOSE) << "VERBOSE!!!"; - DLOG << "test DLOG"; - - LOG(paddle_mobile::kLOG_ERROR) << "ERROR !"; - - return 0; -} diff --git a/mobile/test/common/test_openmp.cpp b/mobile/test/common/test_openmp.cpp deleted file mode 100644 index 790c434101..0000000000 --- a/mobile/test/common/test_openmp.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -//#include -#include - -int main(void) { -#ifdef PADDLE_MOBILE_USE_OPENMP - #pragma omp parallel num_threads(2) - { - // int thread_id = omp_get_thread_num(); - // int nthreads = omp_get_num_threads(); - // std::cout << "Hello, OMP " << thread_id << "/" << nthreads << - // "\n"; - } -#endif - return 0; -} diff --git a/mobile/test/executor_for_test.h b/mobile/test/executor_for_test.h deleted file mode 100644 index 0a67eea5d5..0000000000 --- a/mobile/test/executor_for_test.h +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include "common/log.h" -#include "framework/executor.h" -#include "framework/op_registry.h" -#include "operators/activation_op.h" -#include "operators/conv_op.h" -#include "operators/elementwise_add_op.h" -#include "operators/pool_op.h" -#include "operators/reshape_op.h" -#include "operators/softmax_op.h" -#include "operators/transpose_op.h" - -using paddle_mobile::framework::BlockDesc; -using paddle_mobile::framework::DDim; -using paddle_mobile::framework::Executor; -using paddle_mobile::framework::LoDTensor; -using paddle_mobile::framework::OpDesc; -using paddle_mobile::framework::Program; -using paddle_mobile::framework::Tensor; -using paddle_mobile::framework::Variable; -using std::string; -using std::vector; - -template -class Executor4Test : public Executor { - public: - Executor4Test(Program p, string op_type, - bool use_optimize = false) - : Executor() { - this->use_optimize_ = use_optimize; - this->program_ = p; - if (this->use_optimize_) { - this->program_desc_ = this->program_.optimizeProgram; - } else { - this->program_desc_ = this->program_.originProgram; - } - - if (this->program_.originProgram == nullptr) { - LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "program_desc_ == nullptr"; - } - - const std::vector> &blocks = - this->program_desc_->Blocks(); - std::vector> ops = blocks[0]->Ops(); - for (int i = 0; i < ops.size(); ++i) { - auto op = ops[i]; - if (op->Type() == op_type) { - DLOG << "匹配到: " << op->Type(); - - /// test first meeting op in program - std::shared_ptr> - op_ptr = paddle_mobile::framework::OpRegistry::CreateOp( - op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), - this->program_.scope.get()); - this->ops_of_block0_.push_back(op_ptr); - break; - } - } - if (this->program_.combined) { - this->InitCombineMemory(); - } else { - this->InitMemory(); - } - for (const auto &op : this->ops_of_block0_) { - op->Init(); - } - } - - template - vector> Predict(const vector &ts, - 
const vector &input_names, - const vector &output_names, - const vector &ddims) { - auto scope = this->program_.scope.get(); - size_t input_size = input_names.size(); - size_t out_size = output_names.size(); - - vector input_vars(input_size); - vector input_tensors(input_size); - for (int i = 0; i < input_size; i++) { - input_vars[i] = scope->Var(input_names[i]); - input_tensors[i] = input_vars[i]->GetMutable(); - input_tensors[i]->ShareDataWith(ts[i]); - } - - vector output_vars(out_size); - vector output_tensors(out_size); - vector> output_tensor_sptrs(out_size); - - for (int i = 0; i < out_size; i++) { - output_vars[i] = scope->Var(output_names[i]); - output_tensors[i] = output_vars[i]->GetMutable(); - output_tensors[i]->mutable_data(ddims[i]); - output_tensor_sptrs[i] = std::make_shared(); - output_tensor_sptrs[i].reset(output_tensors[i]); - } - - for (auto &op : this->ops_of_block0_) { - op->Run(); - } - - return output_tensor_sptrs; - } - - std::shared_ptr Predict(const Tensor &t, string input, string output, - const DDim &dDim) { - auto scope = this->program_.scope.get(); - Variable *g_feed_value = scope->Var(input); - auto tensor = g_feed_value->GetMutable(); - tensor->ShareDataWith(t); - - Variable *con_output = scope->Var(output); - auto *output_tensor = con_output->GetMutable(); - output_tensor->mutable_data(dDim); - - for (auto &op : this->ops_of_block0_) { - op->Run(); - } - - return std::make_shared( - paddle_mobile::framework::Tensor(*output_tensor)); - } -}; diff --git a/mobile/test/executor_for_test_opencl.h b/mobile/test/executor_for_test_opencl.h deleted file mode 100644 index 3a8af87592..0000000000 --- a/mobile/test/executor_for_test_opencl.h +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef PADDLE_MOBILE_CL - -#include -#include -#include - -#include "./test_helper.h" -#include "common/log.h" -#include "framework/cl/cl_helper.h" -#include "framework/cl/cl_tensor.h" -#include "framework/executor.h" -#include "framework/op_registry.h" -#include "operators/feed_op.h" -#include "operators/fetch_op.h" - -using paddle_mobile::framework::AttributeMap; -using paddle_mobile::framework::BlockDesc; -using paddle_mobile::framework::DDim; -using paddle_mobile::framework::Executor; -using paddle_mobile::framework::LoDTensor; -using paddle_mobile::framework::OpDesc; -using paddle_mobile::framework::OperatorBase; -using paddle_mobile::framework::Program; -using paddle_mobile::framework::Tensor; -using paddle_mobile::framework::Variable; -using std::string; -using std::vector; -namespace paddle_mobile { -template -class OpenClOpTester { - public: - OpenClOpTester() { - framework::CLEngine::Instance()->setClPath("/data/local/tmp/bin"); - scope_ = std::make_shared(); - feed_clhelper_ = framework::CLHelper(scope_->GetCLScpoe()); - fetch_clhelper_ = framework::CLHelper(scope_->GetCLScpoe()); - this->feed_clhelper_.AddKernel("feed", "feed_kernel.cl"); - this->fetch_clhelper_.AddKernel("fetch", "fetch_kernel.cl"); - - feed_var = scope_.get()->Var("feed"); - fetch_var = scope_.get()->Var("fetch"); - op_in_var = scope_.get()->Var("op_in"); - op_out_var = scope_.get()->Var("op_out"); - } - - void Predict(string op_type, DDim feed_dims, DDim fetch_dims, - VariableNameMap inputs_feed, VariableNameMap outputs_feed, - AttributeMap attrs_feed) { - 
framework::CLImage *const op_in_cl_image = - op_in_var->template GetMutable(); - op_in_cl_image->Resize(feed_dims); - op_in_cl_image->InitEmptyImage(feed_clhelper_.CLContext(), - feed_clhelper_.CLCommandQueue(), feed_dims); - framework::CLImage *const op_out_cl_image = - op_out_var->template GetMutable(); - op_out_cl_image->Resize(fetch_dims); - framework::CLScope *const clScpoe = scope_->GetCLScpoe(); - op_out_cl_image->InitEmptyImage(clScpoe->Context(), clScpoe->CommandQueue(), - fetch_dims); - - Feed(feed_dims); - auto *op = new OpType(op_type, inputs_feed, outputs_feed, attrs_feed, - scope_.get()); - op->InferShape(); - op->Init(); - op->Run(); - Fetch(fetch_dims); - } - void Feed(DDim feed_dims) { - auto *feed_var = scope_->Var("feed"); - auto *_var = scope_->Var("op_in"); - auto *const input = feed_var->template GetMutable(); - DLOG << "feed_dims: " << feed_dims; - SetupTensor(input, feed_dims, -100.0, 100.0); - framework::CLImage *const op_in_cl_image = - op_in_var->template GetMutable(); - DLOG << "FeedKernel run "; - DLOG << "params.input " << *input; - DLOG << "params.op_in_cl_image " << *op_in_cl_image; - auto kernel = this->feed_clhelper_.KernelAt(0); - DLOG << "kernel get success "; - - auto default_work_size = - this->feed_clhelper_.DefaultWorkSize(*(op_in_cl_image)); - - DLOG << "op_in_cl_image: " << *op_in_cl_image; - DLOG << "default_work_size: " << default_work_size; - cl_int status; - int numel = input->numel(); - cl_mem output_image = op_in_cl_image->GetCLImage(); - const int out_C = op_in_cl_image->dims()[1]; - const int out_H = op_in_cl_image->dims()[2]; - const int out_W = op_in_cl_image->dims()[3]; - const int Stride2 = out_C * out_H * out_W; - const int Stride1 = out_H * out_W; - const int Stride0 = out_W; - framework::CLTensor input_cl_tensor(this->feed_clhelper_.CLContext(), - this->feed_clhelper_.CLCommandQueue()); - input_cl_tensor.Resize(input->dims()); - cl_mem inputBuffer; - - inputBuffer = - 
input_cl_tensor.mutable_with_data(input->data()); - - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->feed_clhelper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - - CL_CHECK_ERRORS(status); - - DLOG << "*op_in_cl_image: " << *op_in_cl_image; - } - - void Fetch(DDim fetch_dims) { - DLOG << "------------------ Fetch op ---------------------"; - - DLOG << "------------------ Fetch op end ---------------------"; - } - - private: - std::shared_ptr scope_; - framework::CLHelper feed_clhelper_; - framework::CLHelper fetch_clhelper_; - - Variable *feed_var; - Variable *fetch_var; - Variable *op_in_var; - Variable *op_out_var; -}; -} // namespace paddle_mobile -#endif diff --git a/mobile/test/fpga/test_concat_op.cpp b/mobile/test/fpga/test_concat_op.cpp deleted file mode 100644 index 44b9f4971b..0000000000 --- a/mobile/test/fpga/test_concat_op.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/concat_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(g_googlenet); - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "concat"); - - // 1. input_tensors; - vector input_tensors; - - Tensor input1; - auto input1_data = CreateInput(&input1, {4, 10, 2, 2}, 0, 1); - input_tensors.push_back(input1); - Tensor input2; - auto input2_data = CreateInput(&input2, {4, 20, 2, 2}, 0, 1); - input_tensors.push_back(input2); - Tensor input3; - auto input3_data = CreateInput(&input3, {4, 30, 2, 2}, 0, 1); - input_tensors.push_back(input3); - Tensor input4; - auto input4_data = CreateInput(&input4, {4, 40, 2, 2}, 0, 1); - input_tensors.push_back(input4); - // 2. input_names - vector input_names({ - "conv2d_3.tmp_1", - "conv2d_5.tmp_1", - "conv2d_7.tmp_1", - "conv2d_8.tmp_1", - }); - - // 3. output_names - vector output_names({"concat_0.tmp_0"}); - - // 4. out_dims; - vector out_ddims; - auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2}); - out_ddims.push_back(out_ddim); - - auto output = executor.Predict(input_tensors, input_names, - output_names, out_ddims); - - auto output0_data = output[0]->data(); - - // 5. test one example. 
- int input_n = 1; - int input_c = 2; - int input_h = 0; - int input_w = 1; - int stride0 = input3.numel() / input3.dims()[0]; - int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1]; - int stride2 = input3.dims()[3]; - /// inputx1 (4,10,2,2), - /// inputx2 (4,20,2,2), - /// inputx3 (4,30,2,2), - /// inputx4 (4,40,2,2), - /// axis = 1 - /// output (4,100,2,2) - int input_index = - input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w; - int output_index = input_n * 100 * 2 * 2 + - (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 + - input_h * 2 + input_w; - - DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index]; - DLOG << " output [1,32,0,1] = " << output0_data[output_index]; - return 0; -} diff --git a/mobile/test/fpga/test_densebox_combine.cpp b/mobile/test/fpga/test_densebox_combine.cpp deleted file mode 100644 index 056bbe52d8..0000000000 --- a/mobile/test/fpga/test_densebox_combine.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -static const char *g_densebox_combine = "../models/densebox"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - // paddle_mobile.SetThreadNum(4); - if (paddle_mobile.Load(std::string(g_densebox_combine) + "/model", - std::string(g_densebox_combine) + "/params", true)) { - // std::vector input; - // std::vector dims{1, 3, 512, 1024}; - // GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - // auto vec_result = paddle_mobile.Predict(input, dims); - - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 512, 1024}, static_cast(0), - static_cast(1)); - // readStream(g_image_src_float, - // input_tensor.mutable_data({1, 3, 224, 224})); - paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(-1); - } - - return 0; -} diff --git a/mobile/test/fpga/test_format_data.cpp b/mobile/test/fpga/test_format_data.cpp deleted file mode 100644 index 1d67c3110f..0000000000 --- a/mobile/test/fpga/test_format_data.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "fpga/api.h" - -namespace frame = paddle_mobile::framework; -namespace fpga = paddle_mobile::fpga; -using std::cout; -using std::endl; - -void test_format_image() { - std::vector dims{1, 1, 3, 3}; - std::vector elements{1, 2, 3, 4, 5, 6, 7, 8, 9}; - frame::DDim ddim = frame::make_ddim(dims); - frame::Tensor image(elements, ddim); - int num = image.numel(); - float *data_ptr = image.mutable_data(); - - for (int i = 0; i < num; i++) { - cout << data_ptr[i] << " "; - } - cout << endl; - - fpga::format_image(&image); - data_ptr = image.mutable_data(); - - for (int i = 0; i < 48; i++) { - cout << data_ptr[i] << " "; - } - cout << endl; - auto dd = image.dims(); - cout << dims[0] << dims[1] << dims[2] << dims[3] << endl; -} - -void test_fill_conv_arg() { - Tensor input, out, filter; - DLOG << "Setup input"; - SetupTensor(&input, {1, 250, 32, 30}, static_cast(0), - static_cast(1)); - - DLOG << "Setup filter"; - SetupTensor(&filter, {1001, 250, 3, 3}, static_cast(0), - static_cast(1)); - - DLOG << "Setup output"; - SetupTensor(&out, {1, 1001, 32, 30}, static_cast(0), - static_cast(1)); - auto bs_ptr = (float *)fpga::fpga_malloc(2 * 1001 * sizeof(float)); - - DLOG << "find max"; - float max_value = fpga::filter_find_max(&filter); - DLOG << "format filter"; - fpga::format_filter(&filter, max_value, 1); - - DLOG << "format bs_ptr"; - int element_num_per_div = fpga::get_filter_num_per_div(&filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001); - - DLOG << "format ofm"; - fpga::format_fp16_ofm(&out); - DLOG << "Build arg"; - - fpga::WrapperConvArgs arg; - fpga::fill_conv_arg(&arg, &input, &out, &filter, true, 1, 1, 1, 1, 1, bs_ptr); - DLOG << "splitNum: " << arg.split_num << " group_num:" << arg.group_num - << " filter_num:" << arg.filter_num; - - for (int i = 0; i < arg.split_num; i++) { - DLOG << arg.conv_args[i].filter_num << " " << arg.conv_args[i].sb_address - << " 
" << arg.conv_args[i].filter_address << " " - << arg.conv_args[i].filter_scale_address; - } -} - -int main() { - test_format_image(); - test_fill_conv_arg(); - return 0; -} diff --git a/mobile/test/fpga/test_marker.cpp b/mobile/test/fpga/test_marker.cpp deleted file mode 100644 index e0977b57f0..0000000000 --- a/mobile/test/fpga/test_marker.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_MOBILE_FPGA -#define PADDLE_MOBILE_FPGA -#endif - -#include "../test_helper.h" -#include "../test_include.h" -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -#include -#include -#include "../../src/io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT -using namespace paddle_mobile::fpga; // NOLINT - -static const char *g_image = "../models/marker/marker1/image.bin"; -static const char *g_model = "../models/marker/marker1/model"; -static const char *g_param = "../models/marker/marker1/params"; - -void readStream(std::string filename, char *buf) { - std::ifstream in; - in.open(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) { - std::cout << "open File Failed." 
<< std::endl; - return; - } - - in.seekg(0, std::ios::end); // go to the end - auto length = in.tellg(); // report location (this is the length) - in.seekg(0, std::ios::beg); // go back to the beginning - in.read(buf, length); - in.close(); -} - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kFPGA; - config.prog_file = g_model; - config.param_file = g_param; - config.thread_num = 1; - config.batch_size = 1; - config.optimize = true; - config.lod_mode = true; - config.quantification = false; - return config; -} - -int main() { - open_device(); - - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - std::cout << "Finishing loading model" << std::endl; - - float img_info[3] = {432, 1280, 1.0f}; - int img_length = 432 * 1280 * 3; - auto img = reinterpret_cast(fpga_malloc(img_length * sizeof(float))); - readStream(g_image, reinterpret_cast(img)); - - std::cout << "Finishing initializing data" << std::endl; - struct PaddleTensor t_img_info, t_img; - t_img.dtypeid = typeid(float); - t_img_info.layout = LAYOUT_HWC; - t_img_info.shape = std::vector({1, 3}); - t_img_info.name = "Image information"; - t_img_info.data.Reset(img_info, 3 * sizeof(float)); - - t_img.dtypeid = typeid(float); - t_img.layout = LAYOUT_HWC; - t_img.shape = std::vector({1, 432, 1280, 3}); - t_img.name = "Image information"; - t_img.data.Reset(img, img_length * sizeof(float)); - predictor->FeedPaddleTensors({t_img_info, t_img}); - - std::cout << "Finishing feeding data " << std::endl; - - predictor->Predict_From_To(0, -1); - std::cout << "Finishing predicting " << std::endl; - - std::vector v; // No need to initialize v - predictor->FetchPaddleTensors(&v); // Old data in v will be cleared - for (int i = 0; i < v.size(); ++i) { - auto p = reinterpret_cast(v[i].data.data()); - int len = v[i].data.length(); - float result = 0.0f; - std::string str = "fetch" + 
std::to_string(i); - fpga::savefile(str, p, len, result); - } - - std::cout << "Finish getting vector values" << std::endl; - - //////////////////////////////////////////////////// - - // PaddleTensor tensor; - // predictor->GetPaddleTensor("fetch2", &tensor); - // for (int i = 0; i < post_nms; i++) { - // auto p = reinterpret_cast(tensor.data.data()); - // std::cout << p[+i] << std::endl; - // } - - return 0; -} diff --git a/mobile/test/fpga/test_marker2.cpp b/mobile/test/fpga/test_marker2.cpp deleted file mode 100644 index b4af515c73..0000000000 --- a/mobile/test/fpga/test_marker2.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif -#include -#ifdef COST_TIME_PRINT -#include -#include -#include -#endif -void readStream(std::string filename, char *buf) { - std::ifstream in; - in.open(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) { - std::cout << "open File Failed." 
<< std::endl; - return; - } - - in.seekg(0, std::ios::end); // go to the end - auto length = in.tellg(); // report location (this is the length) - in.seekg(0, std::ios::beg); // go back to the beginning - in.read(buf, length); - DLOG << length; - in.close(); -} - -void convert_to_chw(int16_t **data_in, int channel, int height, int width, - int num, int16_t *data_tmp) { - int64_t amount_per_side = width * height; - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + n * amount_per_side * channel + c * amount_per_side + - width * h + w) = *((*data_in)++); - } - } - } - } -} - -void dump_stride_half(std::string filename, Tensor input_tensor, - const int dumpnum, bool use_chw) { - // bool use_chw = true; - if (input_tensor.dims().size() != 4) return; - int c = (input_tensor.dims())[1]; - int h = (input_tensor.dims())[2]; - int w = (input_tensor.dims())[3]; - int n = (input_tensor.dims())[0]; - auto data_ptr = input_tensor.get_data(); - auto *data_ptr_16 = reinterpret_cast(data_ptr); - auto data_tmp = data_ptr_16; - if (use_chw) { - data_tmp = - reinterpret_cast(malloc(n * c * h * w * sizeof(int16_t))); - convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); - } - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); - out << result << std::endl; - } - out.close(); - if (data_tmp != data_ptr_16) { - free(data_tmp); - } -} - -void dump_stride_float(std::string filename, Tensor input_tensor, - const int dumpnum) { - auto data_ptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? 
stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} - -void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, - bool use_chw) { - static int i = 0; - if (input_tensor.numel() == 0) { - return; - } - if (input_tensor.type() == typeid(float)) { - DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); - dump_stride_float(filename, input_tensor, dumpnum); - } else { - DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); - dump_stride_half(filename, input_tensor, dumpnum, use_chw); - } - DLOG << "dump input address: " << input_tensor.get_data(); -} - -static const char *g_marker_combine = "../models/marker/marker_2segment"; -// static const char *g_marker_combine = "../models/marker/model2"; -static const char *g_image_src_float = - "../models/marker/marker_2segment/marker_2.bin"; -// static const char *g_image_src_float = "../models/marker/model2/data.bin"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - - if (paddle_mobile.Load(std::string(g_marker_combine) + "/model", - std::string(g_marker_combine) + "/params", true, false, - 1, true)) { - // if (paddle_mobile.Load(std::string(g_marker_combine), true)) { - float img_info[3] = {432, 1280, 1.0f}; - auto img = reinterpret_cast( - fpga::fpga_malloc(144 * 14 * 14 * sizeof(float))); - readStream(g_image_src_float, reinterpret_cast(img)); - - std::vector v(3, nullptr); - paddle_mobile.FeedData({img}); - // paddle_mobile.Predict_To(-1); -#ifdef COST_TIME_PRINT - timeval start11, end11; - long dif_sec, dif_usec; // NOLINT -#endif - -#ifdef COST_TIME_PRINT - gettimeofday(&start11, NULL); -#endif - - paddle_mobile.Predict_To(-1); - -#ifdef COST_TIME_PRINT - gettimeofday(&end11, NULL); - dif_sec = end11.tv_sec - start11.tv_sec; - dif_usec = end11.tv_usec - start11.tv_usec; - std::cout << "total: " - << " cost time: " << (dif_sec * 1000000 
+ dif_usec) << " us" - << std::endl; -#endif - - for (int i = 0; i < 8; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "marker_" + std::to_string(i); - // if(i != 58) - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(float)); - // tensor_ptr->numel() * sizeof(float)); - - dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), - true); // 20);//tensor_ptr->numel()); - } - - // paddle_mobile.GetResults(&v); - DLOG << "Computation done"; - fpga::fpga_free(img); - } - - return 0; -} diff --git a/mobile/test/fpga/test_marker_api.cpp b/mobile/test/fpga/test_marker_api.cpp deleted file mode 100644 index 19e051a38d..0000000000 --- a/mobile/test/fpga/test_marker_api.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_MOBILE_FPGA -#define PADDLE_MOBILE_FPGA -#endif -#include -#include -#include -#include -#include -#include "../../src/io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT -using namespace paddle_mobile::fpga; // NOLINT - -static const char *g_image = "../models/marker/model/image.bin"; -static const char *g_model = "../models/marker/model/model"; -static const char *g_param = "../models/marker/model/params"; - -static const char *g_image1 = "../models/marker2/model/marker.bin"; -static const char *g_model1 = "../models/marker2/model/model"; -static const char *g_param1 = "../models/marker2/model/params"; - -void readStream(std::string filename, char *buf) { - std::ifstream in; - in.open(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - - in.seekg(0, std::ios::end); // go to the end - auto length = in.tellg(); // report location (this is the length) - in.seekg(0, std::ios::beg); // go back to the beginning - in.read(buf, length); - in.close(); -} -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} -void quantize(float **data_in, int data_size) { - float *tmp = *data_in; - signed char *tmp_data = - (signed char *)paddle_mobile::fpga::fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8((*data_in)[i] + 128); - } - *data_in = (float *)tmp_data; // NOLINT - paddle_mobile::fpga::fpga_free(tmp); -} - -void convert_to_chw(float **data_in, int channel, int height, int width, - float *data_tmp) { - int64_t amount_per_side = width * height; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); - } - } - } -} - -void dump_stride_float(std::string filename, - paddle_mobile::PaddleTensor 
input_tensor) { - auto data_ptr = reinterpret_cast(input_tensor.data.data()); - int c = (input_tensor.shape)[1]; - int h = (input_tensor.shape)[2]; - int w = (input_tensor.shape)[3]; - int n = (input_tensor.shape)[0]; - float *data_tmp = - reinterpret_cast(malloc(c * h * w * sizeof(float))); - // convert_to_chw(&data_ptr, c, h, w, data_tmp); - std::ofstream out(filename.c_str()); - float result = 0; - int datasize = abs(c * h * w * n); - if (datasize == 0) { - std::cout << "wrong dump data size" << std::endl; - return; - } - for (int i = 0; i < datasize; i++) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} - -void dump_stride(std::string filename, - paddle_mobile::PaddleTensor input_tensor) { - if (input_tensor.dtypeid == PaddlekTypeId_t::paddle_float) { - dump_stride_float(filename, input_tensor); - } else { - std::cout << "only support dumping float data" << std::endl; - } -} -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kFPGA; - config.prog_file = g_model; - config.param_file = g_param; - config.thread_num = 1; - config.batch_size = 1; - config.optimize = true; - config.lod_mode = true; - config.quantification = false; - return config; -} -PaddleMobileConfig GetConfig1() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kFPGA; - config.prog_file = g_model1; - config.param_file = g_param1; - config.thread_num = 1; - config.batch_size = 1; - config.optimize = true; - config.lod_mode = true; - config.quantification = false; - return config; -} - -int main() { - open_device(); - timeval start11, end11; - long dif_sec, dif_usec; // NOLINT - - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - std::cout << "Finishing loading model" << std::endl; - - float img_info[3] = {432, 1280, 1.0f}; - int img_length = 432 * 1280 * 3; - auto 
img = reinterpret_cast(fpga_malloc(img_length * sizeof(float))); - readStream(g_image, reinterpret_cast(img)); - - std::cout << "Finishing initializing data" << std::endl; - struct PaddleTensor t_img_info, t_img; - t_img_info.dtypeid = PaddlekTypeId_t::paddle_float; - t_img_info.layout = LAYOUT_HWC; - t_img_info.shape = std::vector({1, 3}); - t_img_info.name = "Image information"; - t_img_info.data.Reset(img_info, 3 * sizeof(float)); - - t_img.dtypeid = PaddlekTypeId_t::paddle_float; - // quantize(&img, img_length); - // t_img.dtypeid = typeid(int8_t); - t_img.layout = LAYOUT_HWC; - t_img.shape = std::vector({1, 432, 1280, 3}); - t_img.name = "Image information"; - t_img.data.Reset(img, img_length * sizeof(float)); - // t_img.data.Reset(img, img_length * sizeof(int8_t)); - // for(int i = 0; i < 100; ++i){ - predictor->FeedPaddleTensors({t_img_info, t_img}); - - std::cout << "Finishing feeding data " << std::endl; - - gettimeofday(&start11, NULL); - predictor->Predict_From_To(0, -1); - gettimeofday(&end11, NULL); - dif_sec = end11.tv_sec - start11.tv_sec; - dif_usec = end11.tv_usec - start11.tv_usec; - std::cout << "marker1 total" - << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us" - << std::endl; - std::cout << "Finishing predicting " << std::endl; - - std::vector v; // No need to initialize v - predictor->FetchPaddleTensors(&v); // Old data in v will be cleared - std::cout << "Output number is " << v.size() << std::endl; - for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) { - std::string dumpName = "marker_api_fetch_" + std::to_string(fetchNum); - // dump_stride(dumpName, v[fetchNum]); - } - fpga_free(img); - - PaddleMobileConfig config1 = GetConfig1(); - auto predictor1 = - CreatePaddlePredictor(config1); - - std::cout << "Finishing loading model" << std::endl; - for (int i = 0; i < 1; ++i) { - int img_length1 = 144 * 14 * 14; - auto img1 = - reinterpret_cast(fpga_malloc(img_length1 * sizeof(float))); - readStream(g_image1, 
reinterpret_cast(img1)); - - std::cout << "Finishing initializing data" << std::endl; - struct PaddleTensor t_img1; - - t_img1.dtypeid = PaddlekTypeId_t::paddle_float; - t_img1.layout = LAYOUT_HWC; - t_img1.shape = std::vector({1, 14, 14, 144}); - t_img1.name = "Image information"; - t_img1.data.Reset(img1, img_length1 * sizeof(float)); - predictor1->FeedPaddleTensors({t_img1}); - - std::cout << "Finishing feeding data " << std::endl; - - gettimeofday(&start11, NULL); - predictor1->Predict_From_To(0, -1); - gettimeofday(&end11, NULL); - dif_sec = end11.tv_sec - start11.tv_sec; - dif_usec = end11.tv_usec - start11.tv_usec; - std::cout << "marker2 total" - << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us" - << std::endl; - std::cout << "Finishing predicting " << std::endl; - - std::vector v1; // No need to initialize v - predictor1->FetchPaddleTensors(&v1); // Old data in v will be cleared - std::cout << "Output number is " << v1.size() << std::endl; - for (int fetchNum = 0; fetchNum < v1.size(); fetchNum++) { - std::string dumpName = "marker2_api_fetch_" + std::to_string(fetchNum); - dump_stride(dumpName, v1[fetchNum]); - } - fpga_free(img1); - } - return 0; -} diff --git a/mobile/test/fpga/test_mobilenet_api.cpp b/mobile/test/fpga/test_mobilenet_api.cpp deleted file mode 100644 index 5c0a594ca8..0000000000 --- a/mobile/test/fpga/test_mobilenet_api.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_MOBILE_FPGA -#define PADDLE_MOBILE_FPGA -#endif -#include -#include -#include "../../src/io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT -using namespace paddle_mobile::fpga; // NOLINT - -static const char *g_image = "../images/mobilenet_txtdata/1.txt"; -static const char *g_model = "../models/keycurve_l2_regular4_model/__model__"; -static const char *g_param = - "../models/keycurve_l2_regular4_model/model.params"; - -void readStream(std::string filename, float *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} -void quantize(float **data_in, int data_size) { - float *tmp = *data_in; - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8((*data_in)[i] + 128); - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} - -void convert_to_chw(float **data_in, int channel, int height, int width, - float *data_tmp) { - int64_t amount_per_side = width * height; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); - } - } - } -} - -void dump_stride_float(std::string filename, PaddleTensor input_tensor) { - auto data_ptr = reinterpret_cast(input_tensor.data.data()); - int c = (input_tensor.shape)[1]; - int h = (input_tensor.shape)[2]; - int w = (input_tensor.shape)[3]; - int n = (input_tensor.shape)[0]; - float *data_tmp = - reinterpret_cast(malloc(c * h * w * sizeof(float))); - 
convert_to_chw(&data_ptr, c, h, w, data_tmp); - std::ofstream out(filename.c_str()); - float result = 0; - int datasize = abs(c * h * w * n); - if (datasize == 0) { - std::cout << "wrong dump data size" << std::endl; - return; - } - for (int i = 0; i < datasize; i++) { - result = data_tmp[i]; - out << result << std::endl; - } - out.close(); -} - -void dump_stride(std::string filename, PaddleTensor input_tensor) { - if (input_tensor.dtypeid == PaddlekTypeId_t::paddle_float) { - dump_stride_float(filename, input_tensor); - } else { - std::cout << "only support dumping float data" << std::endl; - } -} - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kFPGA; - config.prog_file = g_model; - config.param_file = g_param; - config.thread_num = 1; - config.batch_size = 1; - config.optimize = true; - config.lod_mode = true; - config.quantification = false; - return config; -} -int main() { - open_device(); - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - std::cout << "Finishing loading model" << std::endl; - int img_length = 256 * 416 * 3; - auto img = reinterpret_cast(fpga_malloc(img_length * sizeof(float))); - readStream(g_image, img); - - std::cout << "Finishing initializing data" << std::endl; - struct PaddleTensor t_img; - t_img.dtype = FLOAT32; - t_img.dtypeid = PaddlekTypeId_t::paddle_float; - // quantize(&img, img_length); - // t_img.dtype = INT8; - // t_img.dtypeid = typeid(int8_t); - t_img.layout = LAYOUT_HWC; - t_img.shape = std::vector({1, 256, 416, 3}); - t_img.name = "Image information"; - t_img.data.Reset(img, img_length * sizeof(float)); - // t_img.data.Reset(img, img_length * sizeof(int8_t)); - predictor->FeedPaddleTensors({t_img}); - - std::cout << "Finishing feeding data " << std::endl; - - predictor->Predict_From_To(0, -1); - std::cout << "Finishing predicting " << std::endl; - - std::vector v; // No need to 
initialize v - predictor->FetchPaddleTensors(&v); // Old data in v will be cleared - std::cout << "Output number is " << v.size() << std::endl; - for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) { - std::string dumpName = "mobilenet_api_fetch_" + std::to_string(fetchNum); - dump_stride(dumpName, v[fetchNum]); - } - return 0; -} diff --git a/mobile/test/fpga/test_pe.cpp b/mobile/test/fpga/test_pe.cpp deleted file mode 100644 index f5f2708b9e..0000000000 --- a/mobile/test/fpga/test_pe.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#include "fpga/V2/filter.h" - -namespace fpga = paddle_mobile::fpga; - -static const uint32_t N = 64; -static const uint32_t C = 3; -static const uint32_t H = 224; -static const uint32_t W = 224; -static const uint32_t G = 1; - -fpga::DataType input_type = fpga::DATA_TYPE_FP32; -fpga::DataType output_type = fpga::DATA_TYPE_FP16; - -void* ifm = nullptr; -void* ofm = nullptr; -void* filter = nullptr; -void* ifm_scale = nullptr; -void* ofm_scale = nullptr; -void* filter_scale = nullptr; - -int ifm_size = 0, ofm_size = 0; - -void format_data() { - ifm_scale = fpga::fpga_malloc(8); - ofm_scale = fpga::fpga_malloc(8); - int ifm_channel = fpga::filter::calc_aligned_channel(C); - int ofm_channel = fpga::filter::calc_aligned_channel(N); - int num = fpga::filter::calc_aligned_num(N, C); - DLOG << "ifm_channel = " << ifm_channel; - DLOG << "ofm_channel = " << ofm_channel; - DLOG << "aligned_num = " << num; - ifm_size = ifm_channel * H * W; - ofm_size = ofm_channel * H * W; - ifm = fpga::fpga_malloc(ifm_size * sizeof(float)); - ofm = fpga::fpga_malloc(ofm_size * sizeof(int16_t)); - memset(ifm, 0, ifm_size * sizeof(float)); - memset(ofm, 0, ofm_size * sizeof(int16_t)); - - for (int h = 0; h < H; h++) { - for (int w = 0; w < W; w++) { - for (int c = 0; c < C; c++) { - int index = h * W * ifm_channel + w * ifm_channel + c; - (reinterpret_cast(ifm))[index] = h + w + c * 0.1f; - // DLOG << index << ":" << ((float *) ifm)[index]; - } - } - } - fpga::fpga_flush(ifm, ifm_size * sizeof(float)); - fpga::fpga_flush(ofm, ofm_size * sizeof(int16_t)); -} - -void print_fp16(int16_t* ptr, int total_size, int num) { - fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t)); - int stride = total_size / num; - for (int i = 0; i < total_size; i += stride) { - DLOG << fpga::fp16_2_fp32(ptr[i]); - } -} - -void print_fp32(float* ptr, int total_size, int num) { - fpga::fpga_invalidate(ptr, total_size * sizeof(float)); - int stride = 
total_size / num; - for (int i = 0; i < total_size; i += stride) { - DLOG << ptr[i]; - } -} - -void test_bypass() { - fpga::BypassArgs args; - args.input_data_type = input_type; - args.output_data_type = output_type; - args.image.address = ifm; - args.image.height = H; - args.image.width = W; - args.image.channels = C; - args.image.scale_address = reinterpret_cast(ifm_scale); - args.output.address = ofm; - args.output.scale_address = reinterpret_cast(ofm_scale); - fpga::PerformBypass(args); -} - -int main() { - paddle_mobile::fpga::open_device(); - format_data(); - DLOG << "format data done"; - print_fp32(reinterpret_cast(ifm), ifm_size, 200); - DLOG << "print input done"; - test_bypass(); - DLOG << "test done"; - print_fp16(reinterpret_cast(ofm), ifm_size, 200); - std::cout << "Computation done" << std::endl; - return 0; -} - -#endif diff --git a/mobile/test/fpga/test_resnet50.cpp b/mobile/test/fpga/test_resnet50.cpp deleted file mode 100644 index e48ad33f36..0000000000 --- a/mobile/test/fpga/test_resnet50.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -void readStream(std::string filename, float *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - string strOne; - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -void convert_to_chw(int16_t **data_in, int channel, int height, int width, - int16_t *data_tmp) { - int64_t amount_per_side = width * height; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); - } - } - } -} - -void dump(std::string filename, Tensor input_tensor) { - auto dataptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - for (int i = 0; i < input_tensor.numel(); ++i) { - result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); - out << result << std::endl; - } - out.close(); -} -void dump_stride_half(std::string filename, Tensor input_tensor, - const int dumpnum) { - int c = (input_tensor.dims())[1]; - int h = (input_tensor.dims())[2]; - int w = (input_tensor.dims())[3]; - auto data_ptr = input_tensor.get_data(); - auto *data_tmp = - reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); - auto *data_ptr_16 = reinterpret_cast(data_ptr); - convert_to_chw(&data_ptr_16, c, h, w, data_tmp); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? 
stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); - out << result << std::endl; - } - out.close(); - free(data_tmp); -} - -void dump_stride_float(std::string filename, Tensor input_tensor, - const int dumpnum) { - auto data_ptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} -static const char *g_resnet50 = "../models/resnet50"; -const std::string g_image_src_float = "../images/image_src_float"; // NOLINT -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - if (paddle_mobile.Load(std::string(g_resnet50), true)) { - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), - static_cast(2)); - readStream(g_image_src_float, - input_tensor.mutable_data({1, 3, 224, 224})); - paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(-1); - for (int i = 0; i < 73; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "resnet50_result_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(half)); - // dump_stride_half(saveName, (*tensor_ptr), 20); - // dump(saveName, (*tensor_ptr)); - } - - auto tensor_ptr = paddle_mobile.FetchResult(73); - // dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); - tensor_ptr = paddle_mobile.FetchResult(74); - // dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); - - float max = 0; - auto data_ptr = tensor_ptr->data(); - int maximumIdx = 0; - for (int i = 0; i < (*tensor_ptr).numel(); i++) { - if (data_ptr[i] > max) { - maximumIdx = i; - max = data_ptr[i]; - } - } - std::cout << "index : " << std::dec << 
maximumIdx << ", value : " << max - << std::endl; - std::cout << "Computation done" << std::endl; - return 0; - } -} diff --git a/mobile/test/fpga/test_rfcn.cpp b/mobile/test/fpga/test_rfcn.cpp deleted file mode 100644 index 50f8aa863d..0000000000 --- a/mobile/test/fpga/test_rfcn.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -#include - -void readStream(std::string filename, char *buf) { - std::ifstream in; - in.open(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) { - std::cout << "open File Failed." 
<< std::endl; - return; - } - - in.seekg(0, std::ios::end); // go to the end - auto length = in.tellg(); // report location (this is the length) - in.seekg(0, std::ios::beg); // go back to the beginning - in.read(buf, length); - DLOG << length; - in.close(); -} - -void convert_to_chw(int16_t **data_in, int channel, int height, int width, - int num, int16_t *data_tmp) { - int64_t amount_per_side = width * height; - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + n * amount_per_side * channel + c * amount_per_side + - width * h + w) = *((*data_in)++); - } - } - } - } -} - -void dump_stride_half(std::string filename, Tensor input_tensor, - const int dumpnum, bool use_chw) { - // bool use_chw = true; - if (input_tensor.dims().size() != 4) return; - int c = (input_tensor.dims())[1]; - int h = (input_tensor.dims())[2]; - int w = (input_tensor.dims())[3]; - int n = (input_tensor.dims())[0]; - auto data_ptr = input_tensor.get_data(); - auto *data_ptr_16 = reinterpret_cast(data_ptr); - auto data_tmp = data_ptr_16; - if (use_chw) { - data_tmp = - reinterpret_cast(malloc(n * c * h * w * sizeof(int16_t))); - convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); - } - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); - out << result << std::endl; - } - out.close(); - if (data_tmp != data_ptr_16) { - free(data_tmp); - } -} - -void dump_stride_float(std::string filename, Tensor input_tensor, - const int dumpnum) { - auto data_ptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? 
stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} - -void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, - bool use_chw) { - static int i = 0; - if (input_tensor.numel() == 0) { - return; - } - if (input_tensor.type() == typeid(float)) { - DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); - - dump_stride_float(filename, input_tensor, dumpnum); - } else { - DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); - - dump_stride_half(filename, input_tensor, dumpnum, use_chw); - } - DLOG << "dump input address: " << input_tensor.get_data(); -} - -static const char *g_rfcn_combine = "../models/rfcn"; -static const char *g_image_src_float = "../models/rfcn/data.bin"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - - if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", - std::string(g_rfcn_combine) + "/params", true, false, - 1, true)) { - float img_info[3] = {768, 1536, 768.0f / 960.0f}; - auto img = reinterpret_cast( - fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float))); - readStream(g_image_src_float, reinterpret_cast(img)); - - std::vector v(3, nullptr); - paddle_mobile.FeedData(std::vector({img_info, img})); - paddle_mobile.Predict_To(-1); - - for (int i = 65; i < 69; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "rfcn_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(float)); - dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true); - } - // paddle_mobile.GetResults(&v); - DLOG << "Computation done"; - fpga::fpga_free(img); - } - - return 0; -} diff --git a/mobile/test/fpga/test_rfcn_api.cpp b/mobile/test/fpga/test_rfcn_api.cpp deleted file mode 100644 index b8b031bf59..0000000000 --- a/mobile/test/fpga/test_rfcn_api.cpp +++ /dev/null @@ -1,172 
+0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_MOBILE_FPGA -#define PADDLE_MOBILE_FPGA -#endif -#include -#include -#include -#include "../../src/io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT -using namespace paddle_mobile::fpga; // NOLINT - -static const char *g_image = "../models/rfcn/data.bin"; -static const char *g_model = "../models/rfcn/model"; -static const char *g_param = "../models/rfcn/params"; - -void readStream(std::string filename, char *buf) { - std::ifstream in; - in.open(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) { - std::cout << "open File Failed." 
<< std::endl; - return; - } - - in.seekg(0, std::ios::end); // go to the end - auto length = in.tellg(); // report location (this is the length) - in.seekg(0, std::ios::beg); // go back to the beginning - in.read(buf, length); - in.close(); -} - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kFPGA; - config.prog_file = g_model; - config.param_file = g_param; - config.thread_num = 1; - config.batch_size = 1; - config.optimize = true; - config.lod_mode = true; - config.quantification = false; - return config; -} - -PaddleMobileConfig GetConfig1() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kFPGA; - config.model_dir = "../models/resnet50"; - config.thread_num = 1; - config.batch_size = 1; - config.optimize = true; - config.quantification = false; - return config; -} - -int main() { - open_device(); -#if 0 - PaddleMobileConfig config1 = GetConfig1(); - auto predictor1 = - CreatePaddlePredictor(config1); - - std::cout << "Finishing loading model" << std::endl; - - int img_length1 = 224 * 224 * 3; - auto img1 = - reinterpret_cast(fpga_malloc(img_length1 * sizeof(float))); - - std::cout << "Finishing initializing data" << std::endl; - - struct PaddleTensor t_img1; - - t_img1.dtypeid = type_id().hash_code(); - t_img1.layout = LAYOUT_HWC; - t_img1.shape = std::vector({1, 224, 224, 3}); - t_img1.name = "Image information"; - t_img1.data.Reset(img1, img_length1 * sizeof(float)); - predictor1->FeedPaddleTensors({t_img1}); - predictor1->Predict_From_To(0, -1); - std::cout << "Finishing predicting " << std::endl; - - std::vector v1; // No need to initialize v - predictor1->FetchPaddleTensors(&v1); // Old data in v will be cleared - std::cout << "Output number is " << v1.size() << std::endl; - std::cout << "out[0] length " << v1[0].data.length() << std::endl; - fpga_free(img1); -#endif - 
//////////////////////////// - - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - std::cout << "Finishing loading model" << std::endl; - - float img_info[3] = {432, 1280, 1.0f}; - int img_length = 432 * 1280 * 3; - auto img = reinterpret_cast(fpga_malloc(img_length * sizeof(float))); - readStream(g_image, reinterpret_cast(img)); - - std::cout << "Finishing initializing data" << std::endl; - struct PaddleTensor t_img_info, t_img; - t_img.dtypeid = PaddlekTypeId_t::paddle_float; - t_img_info.layout = LAYOUT_HWC; - t_img_info.shape = std::vector({1, 3}); - t_img_info.name = "Image information"; - t_img_info.data.Reset(img_info, 3 * sizeof(float)); - - t_img.dtypeid = PaddlekTypeId_t::paddle_float; - t_img.layout = LAYOUT_HWC; - t_img.shape = std::vector({1, 432, 1280, 3}); - t_img.name = "Image information"; - t_img.data.Reset(img, img_length * sizeof(float)); - predictor->FeedPaddleTensors({t_img_info, t_img}); - - std::cout << "Finishing feeding data " << std::endl; - - predictor->Predict_From_To(0, -1); - std::cout << "Finishing predicting " << std::endl; - - std::vector v; // No need to initialize v - predictor->FetchPaddleTensors(&v); // Old data in v will be cleared - std::cout << "Output number is " << v.size() << std::endl; - std::cout << "out[0] length " << v[0].data.length() << std::endl; - std::cout << "out[1] length " << v[1].data.length() << std::endl; - std::cout << "out[2] length " << v[2].data.length() << std::endl; - - auto post_nms = v[0].data.length() / sizeof(float) / 8; - for (int num = 0; num < post_nms; num++) { - for (int i = 0; i < 8; i++) { - auto p = reinterpret_cast(v[0].data.data()); - std::cout << p[num * 8 + i] << std::endl; - } - } - for (int num = 0; num < post_nms; num++) { - for (int i = 0; i < 8; i++) { - auto p = reinterpret_cast(v[1].data.data()); - std::cout << p[num * 8 + i] << std::endl; - } - } - for (int num = 0; num < post_nms; num++) { - for (int i = 0; i < 4; i++) { - auto p 
= reinterpret_cast(v[2].data.data()); - std::cout << p[num * 4 + i] << std::endl; - } - } - std::cout << "Finish getting vector values" << std::endl; - fpga_free(img); - - auto version = fpga::paddle_mobile_version(); - - std::cout << "0X0" << std::hex << version << std::endl; - - return 0; -} diff --git a/mobile/test/fpga/test_ssd.cpp b/mobile/test/fpga/test_ssd.cpp deleted file mode 100644 index c6d2b51a8c..0000000000 --- a/mobile/test/fpga/test_ssd.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include "../test_include.h" - -#include "fpga/KD/float16.hpp" -#include "fpga/KD/llapi/zynqmp_api.h" - -static const char* g_ssd = "../models/resnet50"; - -int main() { - zynqmp::open_device(); - - paddle_mobile::PaddleMobile paddle_mobile; - std::string dir = std::string(g_ssd); - std::string model = std::string(g_ssd) + "/model"; - std::string params = std::string(g_ssd) + "/params"; - - // if (paddle_mobile.Load(dir, true)) { - if (paddle_mobile.Load(model, params, true)) { - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(1), - static_cast(1)); - float* data = input_tensor.mutable_data({1, 3, 224, 224}); - - paddle_mobile.Predict(input_tensor); - auto result_ptr = paddle_mobile.Fetch(); - float* result_data = result_ptr->data(); - } - return 0; -} diff --git a/mobile/test/fpga/test_tensor_quant.cpp b/mobile/test/fpga/test_tensor_quant.cpp deleted file mode 100644 index 6cfc27e91c..0000000000 --- a/mobile/test/fpga/test_tensor_quant.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - if (paddle_mobile.Load(g_resnet, true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - std::vector dims{1, 3, 32, 32}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 32, 32}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - // 预热一次 - paddle_mobile.Predict(input, dims); - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; - } - - return 0; -} diff --git a/mobile/test/fpga/test_yolo_api.cpp b/mobile/test/fpga/test_yolo_api.cpp deleted file mode 100644 index 161d695418..0000000000 --- a/mobile/test/fpga/test_yolo_api.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_MOBILE_FPGA -#define PADDLE_MOBILE_FPGA -#endif -#include -#include -#include "../../src/io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT -using namespace paddle_mobile::fpga; // NOLINT - -static const char *g_image = "../images/yolo_test_txtimg/1.txt"; -static const char *g_model = "../models/yolo_bn_l2_model/__model__"; -static const char *g_param = "../models/yolo_bn_l2_model/model.params"; - -void readStream(std::string filename, float *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} -void quantize(float **data_in, int data_size) { - float *tmp = *data_in; - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8((*data_in)[i] + 128); - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} - -void convert_to_chw(float **data_in, int channel, int height, int width, - float *data_tmp) { - int64_t amount_per_side = width * height; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); - } - } - } -} - -void dump_stride_float(std::string filename, PaddleTensor input_tensor) { - auto data_ptr = reinterpret_cast(input_tensor.data.data()); - int c = (input_tensor.shape)[1]; - int h = (input_tensor.shape)[2]; - int w = (input_tensor.shape)[3]; - int n = (input_tensor.shape)[0]; - float *data_tmp = - reinterpret_cast(malloc(c * h * w * sizeof(float))); - convert_to_chw(&data_ptr, c, h, w, data_tmp); - std::ofstream out(filename.c_str()); - float result = 0; - int datasize = abs(c * h * 
w * n); - if (datasize == 0) { - std::cout << "wrong dump data size" << std::endl; - return; - } - for (int i = 0; i < datasize; i++) { - result = data_tmp[i]; - out << result << std::endl; - } - out.close(); -} - -void dump_stride(std::string filename, PaddleTensor input_tensor) { - if (input_tensor.dtypeid == PaddlekTypeId_t::paddle_float) { - dump_stride_float(filename, input_tensor); - } else { - std::cout << "only support dumping float data" << std::endl; - } -} - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kFPGA; - config.prog_file = g_model; - config.param_file = g_param; - config.thread_num = 1; - config.batch_size = 1; - config.optimize = true; - config.lod_mode = true; - config.quantification = false; - return config; -} - -int main() { - open_device(); - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - std::cout << "Finishing loading model" << std::endl; - int img_length = 256 * 416 * 3; - auto img = reinterpret_cast(fpga_malloc(img_length * sizeof(float))); - readStream(g_image, img); - - std::cout << "Finishing initializing data" << std::endl; - struct PaddleTensor t_img; - // t_img.dtype = FLOAT32; - // t_img.dtypeid = type_id().hash_code(); - quantize(&img, img_length); - t_img.dtype = INT8; - t_img.dtypeid = PaddlekTypeId_t::paddle_int8_t; - t_img.layout = LAYOUT_HWC; - t_img.shape = std::vector({1, 256, 416, 3}); - t_img.name = "Image information"; - // t_img.data.Reset(img, img_length * sizeof(float)); - t_img.data.Reset(img, img_length * sizeof(int8_t)); - predictor->FeedPaddleTensors({t_img}); - - std::cout << "Finishing feeding data " << std::endl; - - predictor->Predict_From_To(0, -1); - std::cout << "Finishing predicting " << std::endl; - - std::vector v; // No need to initialize v - predictor->FetchPaddleTensors(&v); // Old data in v will be cleared - std::cout << "Output number is " << 
v.size() << std::endl; - for (int fetchNum = 0; fetchNum < v.size(); fetchNum++) { - std::string dumpName = "yolo_api_fetch_" + std::to_string(fetchNum); - dump_stride(dumpName, v[fetchNum]); - } - return 0; -} diff --git a/mobile/test/framework/test_inference_api.cpp b/mobile/test/framework/test_inference_api.cpp deleted file mode 100644 index e1713bb203..0000000000 --- a/mobile/test/framework/test_inference_api.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "io/paddle_inference_api.h" - -using namespace paddle_mobile; - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kCPU; - config.model_dir = "../models/mobilenet/"; - config.thread_num = 4; - return config; -} - -int main() { - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - float data[1 * 3 * 224 * 224] = {1.0f}; - - PaddleTensor tensor; - tensor.shape = std::vector({1, 3, 224, 224}); - tensor.data = PaddleBuf(data, sizeof(data)); - tensor.dtype = PaddleDType::FLOAT32; - std::vector paddle_tensor_feeds(1, tensor); - - PaddleTensor tensor_out; - tensor_out.shape = std::vector({}); - tensor_out.data = PaddleBuf(); - tensor_out.dtype = PaddleDType::FLOAT32; - std::vector outputs(1, tensor_out); - - std::cout << " before predict " << std::endl; - - predictor->Run(paddle_tensor_feeds, &outputs); - - std::cout << " after predict " << std::endl; - // assert(); - - float* data_o = static_cast(outputs[0].data.data()); - for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) { - std::cout << "output[" << j << "]: " << data_o[j] << std::endl; - } - - return 0; -} diff --git a/mobile/test/framework/test_load.cpp b/mobile/test/framework/test_load.cpp deleted file mode 100644 index ed74b63497..0000000000 --- a/mobile/test/framework/test_load.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "../test_helper.h" -#include "framework/loader.h" - -int main() { - paddle_mobile::framework::Loader loader; - // ../../../test/models/googlenet - // ../../../test/models/mobilenet - - std::string g_super = "../models/superresoltion"; - // auto program = loader.Load(g_super, true); - - auto program = loader.Load(std::string(g_super) + "/model", - std::string(g_super) + "/params", false); - // program.originProgram->Description("program desc: "); - - return 0; -} diff --git a/mobile/test/framework/test_load_memory.cpp b/mobile/test/framework/test_load_memory.cpp deleted file mode 100644 index afab17d5e7..0000000000 --- a/mobile/test/framework/test_load_memory.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -static size_t ReadBuffer(const char *file_name, uint8_t **out) { - FILE *fp; - fp = fopen(file_name, "rb"); - PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name); - fseek(fp, 0, SEEK_END); - auto size = static_cast(ftell(fp)); - rewind(fp); - DLOG << "model size: " << size; - *out = reinterpret_cast(malloc(size)); - size_t cur_len = 0; - size_t nread; - while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { - cur_len += nread; - } - fclose(fp); - return cur_len; -} - -static char *Get_binary_data(std::string filename) { - FILE *file = fopen(filename.c_str(), "rb"); - PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", - filename.c_str()); - fseek(file, 0, SEEK_END); - int64_t size = ftell(file); - PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); - rewind(file); - auto *data = new char[size]; - size_t bytes_read = fread(data, 1, size, file); - PADDLE_MOBILE_ENFORCE(bytes_read == size, - "read binary file bytes do not match with fseek"); - fclose(file); - return data; -} - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - auto model_path = std::string(g_genet_combine) + "/model"; - auto params_path = std::string(g_genet_combine) + "/params"; - uint8_t *bufModel = nullptr; - size_t sizeBuf = ReadBuffer(model_path.c_str(), &bufModel); - uint8_t *bufParams = nullptr; - - std::cout << "sizeBuf: " << sizeBuf << std::endl; - size_t sizeParams = ReadBuffer(params_path.c_str(), &bufParams); - std::cout << "sizeParams: " << sizeParams << std::endl; - - paddle_mobile.LoadCombinedMemory(sizeBuf, bufModel, sizeParams, bufParams); - return 0; -} diff --git a/mobile/test/framework/test_load_memory_inference_api.cpp b/mobile/test/framework/test_load_memory_inference_api.cpp deleted file mode 100644 index 5b2773f8f1..0000000000 --- a/mobile/test/framework/test_load_memory_inference_api.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include -#include "../test_helper.h" -#include "io/paddle_inference_api.h" - -static size_t ReadBuffer(const char *file_name, uint8_t **out) { - FILE *fp; - fp = fopen(file_name, "rb"); - PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name); - fseek(fp, 0, SEEK_END); - auto size = static_cast(ftell(fp)); - rewind(fp); - DLOG << "model size: " << size; - *out = reinterpret_cast(malloc(size)); - size_t cur_len = 0; - size_t nread; - while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { - cur_len += nread; - } - fclose(fp); - return cur_len; -} - -static char *Get_binary_data(std::string filename) { - FILE *file = fopen(filename.c_str(), "rb"); - PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", - filename.c_str()); - fseek(file, 0, SEEK_END); - int64_t size = ftell(file); - PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); - rewind(file); - auto *data = new char[size]; - size_t bytes_read = fread(data, 1, size, file); - PADDLE_MOBILE_ENFORCE(bytes_read == size, - "read binary file bytes do not match with fseek"); - fclose(file); - return data; -} - -paddle_mobile::PaddleMobileConfig GetConfig() { - paddle_mobile::PaddleMobileConfig config; - config.precision = paddle_mobile::PaddleMobileConfig::FP32; - config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL; - const std::shared_ptr &memory_pack = - std::make_shared(); - auto 
model_path = std::string(g_mobilenet_combined) + "/model"; - auto params_path = std::string(g_mobilenet_combined) + "/params"; - memory_pack->model_size = - ReadBuffer(model_path.c_str(), &memory_pack->model_buf); - std::cout << "sizeBuf: " << memory_pack->model_size << std::endl; - memory_pack->combined_params_size = - ReadBuffer(params_path.c_str(), &memory_pack->combined_params_buf); - std::cout << "sizeParams: " << memory_pack->combined_params_size << std::endl; - memory_pack->from_memory = true; - config.memory_pack = *memory_pack; - config.thread_num = 4; - return config; -} -int main() { - paddle_mobile::PaddleMobileConfig config = GetConfig(); - auto predictor = paddle_mobile::CreatePaddlePredictor< - paddle_mobile::PaddleMobileConfig, - paddle_mobile::PaddleEngineKind::kPaddleMobile>(config); - return 0; -} diff --git a/mobile/test/framework/test_optimize.cpp b/mobile/test/framework/test_optimize.cpp deleted file mode 100644 index 0392020789..0000000000 --- a/mobile/test/framework/test_optimize.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_helper.h" -#include "framework/loader.h" -#include "framework/program/program-optimize/node.h" -#include "framework/program/program-optimize/program_optimize.h" - -int main() { - paddle_mobile::framework::Loader loader; - // "../../../test/models/googlenet" - auto program = loader.Load(g_mobilenet_ssd, true); - paddle_mobile::framework::ProgramOptimize optimize; - // program.originProgram->Description("origin"); - auto optimize_program = optimize.FusionOptimize(program.originProgram); - if (optimize_program != nullptr) { - // optimize_program->Description("optimize"); - } else { - LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null"; - } - return 0; -} diff --git a/mobile/test/net/test_alexnet.cpp b/mobile/test/net/test_alexnet.cpp deleted file mode 100644 index 50053fe82f..0000000000 --- a/mobile/test/net/test_alexnet.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", - // std::string(g_mobilenet_detect) + "/params", true); - - auto isok = paddle_mobile.Load(g_alexnet, true); - if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - auto vec_result = paddle_mobile.Predict(input, dims); - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - DLOG << vec_result; - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - - std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " - "是否存在?" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_benchmark.cpp b/mobile/test/net/test_benchmark.cpp deleted file mode 100644 index 19d37eeded..0000000000 --- a/mobile/test/net/test_benchmark.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main(int argc, char* argv[]) { - if (argc < 4) { - std::cout << "Usage: " << std::endl - << "./test_benchmark fluid_model feed_shape thread_num [use_fuse]" - << std::endl; - std::cout << "use_fuse: optional, bool, default is 1\n"; - return 1; - } - bool optimize = true; - char* fluid_model = argv[1]; - char* feed_shape = argv[2]; - int thread_num = atoi(argv[3]); - if (argc == 5) { - optimize = atoi(argv[4]); - } - - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(thread_num); - auto time1 = time(); - // if (paddle_mobile.Load(fluid_model, optimize, false, 1, true)) { - if (paddle_mobile.Load(std::string(fluid_model) + "/model", - std::string(fluid_model) + "/params", optimize, false, - 1, true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time2) << "ms\n"; - paddle_mobile::framework::Tensor input; - std::shared_ptr output; - std::vector dims{1, 3, 224, 224}; - if (feed_shape) { - sscanf(feed_shape, "%lld,%lld,%lld,%lld", &dims[0], &dims[1], &dims[2], - &dims[3]); - } - std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", " - << dims[2] << ", " << dims[3] << "]\n"; - paddle_mobile::framework::DDim in_shape = - paddle_mobile::framework::make_ddim(dims); - SetupTensor(&input, in_shape, 0.f, 255.f); - // warmup - for (int i = 0; i < 2; ++i) { - paddle_mobile.Predict(input); - } - auto time3 = time(); - int test_count = 100; - for (int i = 0; i < test_count; ++i) { - paddle_mobile.Predict(input); - } - auto time4 = 
time(); - std::cout << "predict cost :" << time_diff(time3, time4) / test_count - << "ms\n"; - std::ostringstream os("output tensor size: "); - output = paddle_mobile.Fetch(); - os << output->numel() << "\n" << output->data()[0]; - for (int i = 1; i < output->numel(); ++i) { - os << ", " << output->data()[i]; - } - std::string output_str = os.str(); - // std::cout << output_str << std::endl; - } - return 0; -} diff --git a/mobile/test/net/test_eng.cpp b/mobile/test/net/test_eng.cpp deleted file mode 100644 index 67b13f1242..0000000000 --- a/mobile/test/net/test_eng.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { -#ifdef PADDLE_MOBILE_CPU - paddle_mobile::PaddleMobile paddle_mobile; -#endif - // paddle_mobile.SetThreadNum(4); - auto time1 = time(); - if (paddle_mobile.Load(std::string(g_eng) + "/model", - std::string(g_eng) + "/params", true, false, 1, - true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - std::vector dims{1, 1, 48, 400}; - LoDTensor input_tensor; - SetupTensor(&input_tensor, {1, 1, 48, 400}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - // 预热十次 - for (int i = 0; i < 1; ++i) { - paddle_mobile.Predict(input_tensor); - } - auto time3 = time(); - for (int i = 0; i < 1; ++i) { - paddle_mobile.Predict(input_tensor); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; - } - return 0; -} diff --git a/mobile/test/net/test_genet_combine.cpp b/mobile/test/net/test_genet_combine.cpp deleted file mode 100644 index e6b0505a67..0000000000 --- a/mobile/test/net/test_genet_combine.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - if (paddle_mobile.Load(std::string(g_genet_combine) + "/model", - std::string(g_genet_combine) + "/params", true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - std::vector input; - std::vector dims{1, 3, 128, 128}; - GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - // 预热一次 - auto vec_result = paddle_mobile.Predict(input, dims); - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - std::cout - << "如果结果Nan请查看: test/images/test_image_1x3x224x224_float 是否存在?" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_gesture.cpp b/mobile/test/net/test_gesture.cpp deleted file mode 100644 index 596d50350e..0000000000 --- a/mobile/test/net/test_gesture.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -const int max_run_times = 10; - -int main(int argc, char **argv) { - if (argc < 3) { - std::cerr - << "Usage: ./test_ocr [detect_model_dir|recog_model_dir] image_path" - << std::endl; - return 1; - } - std::string model_dir = argv[1]; - std::string image_path = argv[2]; - - // init input, output params - std::vector input_vec; - std::vector input_shape; - std::vector output_fetch_nodes; - int PRINT_NODE_ELEM_NUM = 10; - - input_shape.emplace_back(1); - input_shape.emplace_back(3); - input_shape.emplace_back(192); - input_shape.emplace_back(192); - output_fetch_nodes.emplace_back("detection_output_0.tmp_0"); - std::shared_ptr outputs[output_fetch_nodes.size()]; - - // init paddle instance - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(1); - std::cout << "start load " << std::endl; - auto load_success = paddle_mobile.Load(std::string(model_dir) + "/model", - std::string(model_dir) + "/params", - true, false, 1, true); - std::cout << "load_success:" << load_success << std::endl; - // input image raw tensor, generated by - // [scripts](tools/python/imagetools/img2nchw.py) - std::cout << "image_path: " << image_path << std::endl; - std::cout << "input_shape: " << input_shape[0] << ", " << input_shape[1] - << ", " << input_shape[2] << ", " << input_shape[3] << std::endl; - GetInput(image_path, &input_vec, input_shape); - - // model predict - auto pred_start_time = paddle_mobile::time(); - for (int run_idx = 0; run_idx < max_run_times; ++run_idx) { - paddle_mobile.Predict(input_vec, input_shape); - for (int out_idx = 0; out_idx < output_fetch_nodes.size(); ++out_idx) { - auto fetch_name = output_fetch_nodes[out_idx]; - outputs[out_idx] = paddle_mobile.Fetch(fetch_name); - } - } - auto pred_end_time = paddle_mobile::time(); - - // inference time - double pred_time = - paddle_mobile::time_diff(pred_start_time, pred_end_time) / max_run_times; - std::cout << 
"predict time(ms): " << pred_time << std::endl; - - // output result - for (int out_idx = 0; out_idx < output_fetch_nodes.size(); ++out_idx) { - std::string node_id = output_fetch_nodes[out_idx]; - auto node_lod_tensor = outputs[out_idx]; - int node_elem_num = node_lod_tensor->numel(); - float *node_ptr = node_lod_tensor->data(); - std::cout << "==== output_fetch_nodes[" << out_idx - << "] =====" << std::endl; - std::cout << "node_id: " << node_id << std::endl; - std::cout << "node_elem_num: " << node_elem_num << std::endl; - std::cout << "PRINT_NODE_ELEM_NUM: " << PRINT_NODE_ELEM_NUM << std::endl; - PRINT_NODE_ELEM_NUM = - (node_elem_num > PRINT_NODE_ELEM_NUM) ? PRINT_NODE_ELEM_NUM : 0; - for (int eidx = 0; eidx < PRINT_NODE_ELEM_NUM; ++eidx) { - std::cout << node_id << "[" << eidx << "]: " << node_ptr[eidx] - << std::endl; - } - std::cout << std::endl; - } - - return 0; -} diff --git a/mobile/test/net/test_googlenet.cpp b/mobile/test/net/test_googlenet.cpp deleted file mode 100644 index ea6c6ce155..0000000000 --- a/mobile/test/net/test_googlenet.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main(int argc, char *argv[]) { - if (argc < 4) { - std::cout << "Usage: ./test_googlenet fluid-model input-image image-shape " - "[thread-num] [fusion]\n" - << " fluid-model: fluid model path. 
\n" - << " input-image: input raw image path. \n" - << " image-shape: input tensor shape, such as 1,3,224,224.\n" - << " thread-num: optional int, threads count, default is 1.\n" - << " fusion: optional bool, default is 0.\n"; - return 1; - } - int thread_num = 1; - bool optimize = false; - char *fluid_model = argv[1]; - char *input_img = argv[2]; - char *feed_shape = argv[3]; - if (argc >= 5) { - thread_num = atoi(argv[4]); - } - if (argc >= 6) { - optimize = atoi(argv[5]); - } -#ifdef PADDLE_MOBILE_FPGA - paddle_mobile::PaddleMobile paddle_mobile; -#endif -#ifdef PADDLE_MOBILE_CPU - paddle_mobile::PaddleMobile paddle_mobile; -#endif - paddle_mobile.SetThreadNum(thread_num); - auto time1 = time(); - std::vector output; - if (paddle_mobile.Load(fluid_model, optimize, false, 1, true)) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" - << std::endl; - std::vector input; - std::vector dims{1, 3, 224, 224}; - if (feed_shape) { - sscanf(feed_shape, "%lld,%lld,%lld,%lld", &dims[0], &dims[1], &dims[2], - &dims[3]); - } - std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", " - << dims[2] << ", " << dims[3] << "]" << std::endl; - - GetInput(input_img, &input, dims); - - // warmup - for (int i = 0; i < 10; ++i) { - output = paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - output = paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "predict cost: " << time_diff(time3, time4) / 10 << "ms\n"; - - std::ostringstream os; - os << output[0]; - for (int i = 1; i < output.size(); ++i) { - os << ", " << output[i]; - } - DLOG << os.str(); - } - return 0; -} diff --git a/mobile/test/net/test_googlenet_quali.cpp b/mobile/test/net/test_googlenet_quali.cpp deleted file mode 100644 index 28cb6207d7..0000000000 --- a/mobile/test/net/test_googlenet_quali.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { -#ifdef PADDLE_MOBILE_FPGA - paddle_mobile::PaddleMobile paddle_mobile; -#endif - -#ifdef PADDLE_MOBILE_CPU - paddle_mobile::PaddleMobile paddle_mobile; -#endif - - paddle_mobile.SetThreadNum(4); - bool optimize = true; - bool quli = true; - auto time1 = time(); - auto isok = paddle_mobile.Load(std::string(g_googlenet_quali) + "/model", - std::string(g_googlenet_quali) + "/params", - optimize, quli); - if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224, &input, dims); - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - return 0; -} diff --git a/mobile/test/net/test_googlenetv1_combine.cpp b/mobile/test/net/test_googlenetv1_combine.cpp deleted file mode 100644 index 9aab25afd2..0000000000 --- a/mobile/test/net/test_googlenetv1_combine.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - if (paddle_mobile.Load(std::string(g_googlenetv1_combined) + "/model", - std::string(g_googlenetv1_combined) + "/params", - false)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - std::vector input; - std::vector dims{1, 3, 160, 160}; - GetInput(g_img, &input, dims); - - for (int i = 0; i < input.size(); i += 1000) { - std::cout << input[i] << std::endl; - } - // auto vec_result = paddle_mobile.Predict(input, dims); - // std::vector::iterator biggest = - // std::max_element(std::begin(vec_result), std::end(vec_result)); - // std::cout << " Max element is " << *biggest << " at position " - // << std::distance(std::begin(vec_result), biggest) << - // std::endl; - - // // 预热十次 - // for (int i = 0; i < 1; ++i) { - // auto vec_result = paddle_mobile.Predict(input, dims); - // } - auto time3 = time(); - - auto vec_result = paddle_mobile.Predict(input, dims); - - for (int j = 0; j < vec_result.size(); ++j) { - std::cout << j << " : " << vec_result[j] << std::endl; - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" - << std::endl; - } - - return 0; -} diff --git a/mobile/test/net/test_inceptionv4.cpp b/mobile/test/net/test_inceptionv4.cpp deleted file mode 
100644 index fbbc9dd39e..0000000000 --- a/mobile/test/net/test_inceptionv4.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", - // std::string(g_mobilenet_detect) + "/params", true); - - auto isok = paddle_mobile.Load(g_inceptionv4, true); - if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - auto vec_result = paddle_mobile.Predict(input, dims); - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - // DLOG << vec_result; - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - - std::cout << 
"如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " - "是否存在?" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_inference_ercy.cpp b/mobile/test/net/test_inference_ercy.cpp deleted file mode 100644 index 76997bcb8f..0000000000 --- a/mobile/test/net/test_inference_ercy.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kGPU_CL; - config.pre_post_type = PaddleMobileConfig::NONE_PRE_POST; - - config.prog_file = "../models/ercy/model"; - config.param_file = "../models/ercy/params"; - config.lod_mode = false; - config.load_when_predict = false; - return config; -} - -int main() { - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - // reliable - int re_len = 1 * 1 * 64 * 72; - std::vector re_v; - std::vector re_dims{1, 1, 64, 72}; - GetInput(g_test_image_1x3x224x224, &re_v, re_dims); - - PaddleTensor re; - re.shape = std::vector({1, 1, 64, 72}); - re.data = PaddleBuf(re_v.data(), re_len * sizeof(float)); - re.dtype = PaddleDType::FLOAT32; - re.layout = LayoutType::LAYOUT_CHW; - - // grid - int grid_len = 1 * 64 * 72 * 2; - std::vector grid_v; - std::vector 
grid_dims{1, 64, 72, 2}; - GetInput(g_test_image_1x3x224x224, &grid_v, grid_dims); - - PaddleTensor grid; - grid.shape = std::vector({1, 64, 72, 2}); - grid.data = PaddleBuf(grid_v.data(), grid_len * sizeof(float)); - grid.dtype = PaddleDType::FLOAT32; - grid.layout = LayoutType::LAYOUT_CHW; - - // last_input - int last_len = 1 * 128 * 64 * 72; - std::vector last_v; - std::vector last_dims{1, 128, 64, 72}; - GetInput(g_test_image_1x3x224x224, &last_v, last_dims); - - PaddleTensor last; - last.shape = std::vector({1, 128, 64, 72}); - last.data = PaddleBuf(last_v.data(), last_len * sizeof(float)); - last.dtype = PaddleDType::FLOAT32; - last.layout = LayoutType::LAYOUT_CHW; - - // input_rgb - int input_rgb_len = 1 * 4 * 256 * 288; - std::vector input_rgb_v; - std::vector input_rgb_dims{1, 4, 256, 288}; - GetInput(g_test_image_1x3x224x224, &input_rgb_v, input_rgb_dims); - - PaddleTensor input_rgb; - input_rgb.shape = std::vector({1, 4, 256, 288}); - input_rgb.data = PaddleBuf(input_rgb_v.data(), input_rgb_len * sizeof(float)); - input_rgb.dtype = PaddleDType::FLOAT32; - input_rgb.layout = LayoutType::LAYOUT_CHW; - - PaddleTensor output0; - output0.shape = std::vector({}); - output0.data = PaddleBuf(); - output0.dtype = PaddleDType::FLOAT32; - output0.layout = LayoutType::LAYOUT_CHW; - - PaddleTensor output1; - output1.shape = std::vector({}); - output1.data = PaddleBuf(); - output1.dtype = PaddleDType::FLOAT32; - output1.layout = LayoutType::LAYOUT_CHW; - - predictor->Feed("reliable", re); - predictor->Feed("grid", grid); - predictor->Feed("last_input", last); - predictor->Feed("input_rgb", input_rgb); - predictor->Run(); - predictor->Fetch("save_infer_model/scale_0", &output0); - predictor->Fetch("save_infer_model/scale_1", &output1); - - float* out_ptr0 = reinterpret_cast(output0.data.data()); - float* out_ptr1 = reinterpret_cast(output1.data.data()); - std::cout << " print output0 : " << std::endl; - int numel = output0.data.length() / sizeof(float); - int stride = 
numel / 20; - stride = stride > 0 ? stride : 1; - for (size_t j = 0; j < numel; j += stride) { - std::cout << out_ptr0[j] << " "; - } - std::cout << std::endl; - - std::cout << " print output1 : " << std::endl; - numel = output1.data.length() / sizeof(float); - stride = numel / 20; - stride = stride > 0 ? stride : 1; - for (size_t j = 0; j < numel; j += stride) { - std::cout << out_ptr1[j] << " "; - } - std::cout << std::endl; - - return 0; -} diff --git a/mobile/test/net/test_inference_imfix.cpp b/mobile/test/net/test_inference_imfix.cpp deleted file mode 100644 index dacc35f7d0..0000000000 --- a/mobile/test/net/test_inference_imfix.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kGPU_CL; - config.pre_post_type = PaddleMobileConfig::NONE_PRE_POST; - - config.prog_file = "../models/imagefixmodel/model"; - config.param_file = "../models/imagefixmodel/params"; - config.lod_mode = false; - config.load_when_predict = false; - return config; -} - -int main() { - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - // factor - int input_rgb_len = 1 * 3 * 256 * 256; - std::vector input_rgb_v(input_rgb_len, 1); - // SetupData(input_rgb_v.data(), input_rgb_len, 0.f, 1.f); - - PaddleTensor input_rgb; - input_rgb.shape = std::vector({1, 3, 256, 256}); - input_rgb.data = PaddleBuf(input_rgb_v.data(), input_rgb_len * sizeof(float)); - input_rgb.dtype = PaddleDType::FLOAT32; - input_rgb.layout = LayoutType::LAYOUT_CHW; - - // remap - int input_mask_len = 1 * 3 * 256 * 256; - std::vector input_mask_v(input_mask_len, 1); - // SetupData(input_mask_v.data(), input_mask_len, 0.f, 1.f); - - PaddleTensor input_mask; - input_mask.shape = std::vector({1, 3, 256, 256}); - input_mask.data = - PaddleBuf(input_mask_v.data(), input_mask_len * sizeof(float)); - input_mask.dtype = PaddleDType::FLOAT32; - input_mask.layout = LayoutType::LAYOUT_CHW; - - PaddleTensor output0; - output0.shape = std::vector({}); - output0.data = PaddleBuf(); - output0.dtype = PaddleDType::FLOAT32; - output0.layout = LayoutType::LAYOUT_CHW; - - // PaddleTensor output1; - // output1.shape = std::vector({}); - // output1.data = PaddleBuf(); - // output1.dtype = PaddleDType::FLOAT32; - // output1.layout = LayoutType::LAYOUT_CHW; - - // PaddleTensor output2; - // output2.shape = std::vector({}); - // output2.data = PaddleBuf(); - // output2.dtype = PaddleDType::FLOAT32; - // 
output2.layout = LayoutType::LAYOUT_CHW; - - // PaddleTensor output3; - // output3.shape = std::vector({}); - // output3.data = PaddleBuf(); - // output3.dtype = PaddleDType::FLOAT32; - // output3.layout = LayoutType::LAYOUT_CHW; - std::cout << "feed : " << std::endl; - - predictor->Feed("input_rgb", input_rgb); - - std::cout << "feed : " << std::endl; - - predictor->Feed("input_mask", input_mask); - - std::cout << "run : " << std::endl; - - predictor->Run(); - - std::cout << "fetch : " << std::endl; - - predictor->Fetch("save_infer_model/scale_0", &output0); - - float* out_ptr0 = reinterpret_cast(output0.data.data()); - std::cout << " print output0 : " << std::endl; - int numel = output0.data.length() / sizeof(float); - int stride = numel / 20; - stride = stride > 0 ? stride : 1; - for (size_t j = 0; j < numel; j += stride) { - std::cout << out_ptr0[j] << " "; - } - std::cout << std::endl; - - return 0; -} diff --git a/mobile/test/net/test_inference_m2fm.cpp b/mobile/test/net/test_inference_m2fm.cpp deleted file mode 100644 index b40c81ee54..0000000000 --- a/mobile/test/net/test_inference_m2fm.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kGPU_CL; - config.pre_post_type = PaddleMobileConfig::NONE_PRE_POST; - - config.prog_file = "../models/gan_yanlong_check2/model"; - config.param_file = "../models/gan_yanlong_check2/params"; - config.lod_mode = false; - config.load_when_predict = false; - return config; -} - -int main() { - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - // factor - int factor_len = 1 * 256 * 1 * 1; - std::vector factor_v; - std::vector factor_dims{1, 256, 1, 1}; - GetInput(g_test_image_1x3x224x224, &factor_v, factor_dims); - - PaddleTensor factor; - factor.shape = std::vector({1, 256, 1, 1}); - factor.data = PaddleBuf(factor_v.data(), factor_len * sizeof(float)); - factor.dtype = PaddleDType::FLOAT32; - factor.layout = LayoutType::LAYOUT_CHW; - - // remap - int remap_len = 1 * 256 * 256 * 2; - std::vector remap_v; - std::vector remap_dims{1, 256, 256, 2}; - GetInput(g_test_image_1x3x224x224, &remap_v, remap_dims); - - PaddleTensor remap; - remap.shape = std::vector({1, 256, 256, 2}); - remap.data = PaddleBuf(remap_v.data(), remap_len * sizeof(float)); - remap.dtype = PaddleDType::FLOAT32; - remap.layout = LayoutType::LAYOUT_CHW; - - // image - int image_len = 1 * 3 * 256 * 256; - std::vector image_v; - std::vector image_dims{1, 3, 256, 256}; - GetInput(g_test_image_1x3x224x224, &image_v, image_dims); - - PaddleTensor image; - image.shape = std::vector({1, 3, 256, 256}); - image.data = PaddleBuf(image_v.data(), image_len * sizeof(float)); - image.dtype = PaddleDType::FLOAT32; - image.layout = LayoutType::LAYOUT_CHW; - - PaddleTensor output0; - output0.shape = std::vector({}); - output0.data = PaddleBuf(); - output0.dtype = PaddleDType::FLOAT32; - output0.layout = 
LayoutType::LAYOUT_CHW; - - PaddleTensor output1; - output1.shape = std::vector({}); - output1.data = PaddleBuf(); - output1.dtype = PaddleDType::FLOAT32; - output1.layout = LayoutType::LAYOUT_CHW; - - PaddleTensor output2; - output2.shape = std::vector({}); - output2.data = PaddleBuf(); - output2.dtype = PaddleDType::FLOAT32; - output2.layout = LayoutType::LAYOUT_CHW; - - PaddleTensor output3; - output3.shape = std::vector({}); - output3.data = PaddleBuf(); - output3.dtype = PaddleDType::FLOAT32; - output3.layout = LayoutType::LAYOUT_CHW; - - predictor->Feed("x2paddle_mul_factor", factor); - predictor->Feed("x2paddle_base_remap", remap); - predictor->Feed("x2paddle_image", image); - predictor->Run(); - predictor->Fetch("save_infer_model/scale_0", &output0); - predictor->Fetch("save_infer_model/scale_1", &output1); - predictor->Fetch("save_infer_model/scale_2", &output2); - predictor->Fetch("save_infer_model/scale_3", &output3); - - float* out_ptr0 = reinterpret_cast(output0.data.data()); - float* out_ptr1 = reinterpret_cast(output1.data.data()); - std::cout << " print output0 : " << std::endl; - int numel = output0.data.length() / sizeof(float); - int stride = numel / 20; - stride = stride > 0 ? stride : 1; - for (size_t j = 0; j < numel; j += stride) { - std::cout << out_ptr0[j] << " "; - } - std::cout << std::endl; - - std::cout << " print output1 : " << std::endl; - numel = output1.data.length() / sizeof(float); - stride = numel / 20; - stride = stride > 0 ? stride : 1; - for (size_t j = 0; j < numel; j += stride) { - std::cout << out_ptr1[j] << " "; - } - std::cout << std::endl; - - return 0; -} diff --git a/mobile/test/net/test_inference_pre_post.cpp b/mobile/test/net/test_inference_pre_post.cpp deleted file mode 100644 index 39dc942920..0000000000 --- a/mobile/test/net/test_inference_pre_post.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "io/paddle_inference_api.h" - -using namespace paddle_mobile; // NOLINT - -PaddleMobileConfig GetConfig() { - PaddleMobileConfig config; - config.precision = PaddleMobileConfig::FP32; - config.device = PaddleMobileConfig::kGPU_CL; - config.pre_post_type = PaddleMobileConfig::UINT8_255; - - config.prog_file = "../models/superv2/model"; - config.param_file = "../models/superv2/params"; - config.lod_mode = false; - config.load_when_predict = true; - config.cl_path = "/data/local/tmp/bin"; - return config; -} - -int main() { - PaddleMobileConfig config = GetConfig(); - auto predictor = - CreatePaddlePredictor(config); - - int input_length = 1 * 1 * 300 * 300; - int output_length = input_length; - - uint8_t data_ui[300 * 300]; - for (int i = 0; i < input_length; ++i) { - data_ui[i] = i % 256; - } - - PaddleTensor input; - input.shape = std::vector({1, 1, 300, 300}); - input.data = PaddleBuf(data_ui, sizeof(data_ui)); - input.dtype = PaddleDType::UINT8; - input.layout = LayoutType::LAYOUT_CHW; - std::vector inputs(1, input); - - PaddleTensor output; - output.shape = std::vector({}); - output.data = PaddleBuf(); - output.dtype = PaddleDType::UINT8; - output.layout = LayoutType::LAYOUT_CHW; - std::vector outputs(1, output); - - std::cout << " print input : " << std::endl; - int stride = input_length / 20; - stride = stride > 0 ? 
stride : 1; - for (size_t j = 0; j < input_length; j += stride) { - std::cout << (unsigned)data_ui[j] << " "; - } - std::cout << std::endl; - - predictor->Run(inputs, &outputs); - - std::cout << " print output : " << std::endl; - uint8_t *data_o = static_cast(outputs[0].data.data()); - int numel = outputs[0].data.length() / sizeof(uint8_t); - stride = numel / 20; - stride = stride > 0 ? stride : 1; - for (size_t j = 0; j < numel; j += stride) { - std::cout << (unsigned)data_o[j] << " "; - } - std::cout << std::endl; - - return 0; -} diff --git a/mobile/test/net/test_mobilenet+ssd.cpp b/mobile/test/net/test_mobilenet+ssd.cpp deleted file mode 100644 index 85083ca441..0000000000 --- a/mobile/test/net/test_mobilenet+ssd.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - auto isok = paddle_mobile.Load( - std::string(g_mobilenet_ssd_gesture) + "/model", - std::string(g_mobilenet_ssd_gesture) + "/params", true); - // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false); - if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; - - std::vector input; - std::vector dims{1, 3, 300, 300}; - GetInput(g_hand, &input, dims); - - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto output = paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - auto output = paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - return 0; -} diff --git a/mobile/test/net/test_mobilenet.cpp b/mobile/test/net/test_mobilenet.cpp deleted file mode 100644 index 5cce53e866..0000000000 --- a/mobile/test/net/test_mobilenet.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = paddle_mobile::time(); - // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", - // std::string(g_mobilenet_detect) + "/params", true); - - auto isok = paddle_mobile.Load(g_mobilenet, true); - if (isok) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" - << std::endl; - - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - auto vec_result = paddle_mobile.Predict(input, dims); - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time3 = paddle_mobile::time(); - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - DLOG << vec_result; - auto time4 = paddle_mobile::time(); - std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 - << "ms" << std::endl; - } - - std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " - "是否存在?" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_mobilenet_025_fssd.cpp b/mobile/test/net/test_mobilenet_025_fssd.cpp deleted file mode 100644 index c0d037ceb0..0000000000 --- a/mobile/test/net/test_mobilenet_025_fssd.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main(int argc, char **argv) { - int times = 10; - if (argc <= 1) { - times = 10; - std::cout << "没有输入 , 使用默认10次 " << times << std::endl; - } else { - std::string arstr = argv[1]; - times = std::stoi(arstr); - std::cout << "input times: " << times << std::endl; - } - - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(1); - auto isok = - paddle_mobile.Load(std::string(g_fluid_fssd_new) + "/model", - std::string(g_fluid_fssd_new) + "/params", true); - if (isok) { - std::vector input; - std::vector dims{1, 3, 160, 160}; - GetInput(g_imgfssd_ar1, &input, dims); - std::cout << "预热10次....." << std::endl; - - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto output = paddle_mobile.Predict(input, dims); - } - std::cout << "开始....." << std::endl; - - double time_sum = 0; - - for (int i = 0; i < times; ++i) { - auto time3 = time(); - auto output = paddle_mobile.Predict(input, dims); - auto time4 = time(); - double timeDiff = time_diff(time3, time4); - time_sum += timeDiff; - std::cout << "第" << i << "次" - << "predict cost :" << timeDiff << "ms" << std::endl; - } - std::cout << "平均时间:" << time_sum / times << "ms" << std::endl; - } - return 0; -} diff --git a/mobile/test/net/test_mobilenet_GPU.cpp b/mobile/test/net/test_mobilenet_GPU.cpp deleted file mode 100644 index 8848f23d39..0000000000 --- a/mobile/test/net/test_mobilenet_GPU.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../../src/common/types.h" -#include "../test_helper.h" -#include "../test_include.h" - -int main(int argc, char **argv) { - // init input args - string model_dir = g_mobilenet; - int64_t N = 1; - int64_t C = 3; - int64_t H = 224; - int64_t W = 224; - int repeats = 10; - int warmup = 10; - int print_output_elem = 0; - - std::cout << "argc:" << argc << std::endl; - if (argc > 1 && argc < 9) { - std::cout << "usage:" << argv[0] << "\n" - << " \n" - << " \n" - << " \n" - << " \n" - << " \n" - << " \n" - << " \n" - << " " << std::endl; - return 0; - } - - if (argc >= 9) { - model_dir = argv[1]; - N = atoi(argv[2]); - C = atoi(argv[3]); - H = atoi(argv[4]); - W = atoi(argv[5]); - repeats = atoi(argv[6]); - warmup = atoi(argv[7]); - print_output_elem = atoi(argv[8]); - } - - std::cout << "input shape(NCHW):" << N << " " << C << " " << H << " " << W - << std::endl; - std::cout << "repeats:" << repeats << std::endl; - std::cout << "model_dir:" << model_dir << std::endl; - - paddle_mobile::PaddleMobile paddle_mobile; - // paddle_mobile.SetThreadNum(4); - auto load_start = paddle_mobile::time(); -#ifdef PADDLE_MOBILE_CL - paddle_mobile.SetCLPath("/data/local/tmp/bin"); -#endif - - auto load_model_status = paddle_mobile.Load(std::string(model_dir), true); - if (!load_model_status) { - std::cout << "failed to load model from:" << model_dir << std::endl; - return 0; - } - - auto load_end = paddle_mobile::time(); - std::cout << 
"load cost:" << paddle_mobile::time_diff(load_start, load_end) - << " ms" << std::endl; - - // input tensor - std::vector input; - std::vector dims{N, C, H, W}; - GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - // warmup - std::vector vec_result = paddle_mobile.Predict(input, dims); - for (int widx = 0; widx < warmup; ++widx) { - paddle_mobile.Predict(input, dims); - } - - // benchmark - float sum_duration = 0.0f; - float min_duration = 1e5f; - float max_duration = 1e-5f; - float ave_duration = -1; - for (int ridx = 0; ridx < repeats; ++ridx) { - auto start = paddle_mobile::time(); - vec_result = paddle_mobile.Predict(input, dims); - auto end = paddle_mobile::time(); - auto duration = paddle_mobile::time_diff(start, end); - sum_duration += duration; - min_duration = (duration > min_duration) ? min_duration : duration; - max_duration = (duration < max_duration) ? max_duration : duration; - std::cout << "ridx:" << ridx + 1 << "/" << repeats << " " << duration - << " ms" << std::endl; - } - - // benchmark result - ave_duration = sum_duration / static_cast(repeats); - - // output result - float output_sum = 0; - float output_ave = -1; - for (size_t oidx = 0; oidx < vec_result.size(); ++oidx) { - output_sum += vec_result[oidx]; - if (print_output_elem) { - std::cout << "out_idx:" << oidx << " " << vec_result[oidx] << std::endl; - } - } - output_ave = output_sum / static_cast(vec_result.size()); - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - - // summary - std::cout << "===== predict benchmark ====" << std::endl - << "run repeats:" << repeats << std::endl - << "sum_duration:" << sum_duration << " ms" << std::endl - << "ave_duration:" << ave_duration << " ms" << std::endl - << "max_duration:" << max_duration << " ms" << std::endl - << "min_duration:" << min_duration << " ms" << std::endl - << "\n===== predict result ====" << std::endl - << "output_sum:" << output_sum << std::endl - << "output_ave:" << 
output_ave << std::endl - << "output_size:" << vec_result.size() << std::endl - << "Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl - << "Note: 如果结果Nan请查看:" - " test/images/g_test_image_1x3x224x224_banana " - "是否存在?" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_mobilenet_combine.cpp b/mobile/test/net/test_mobilenet_combine.cpp deleted file mode 100644 index af00085b6d..0000000000 --- a/mobile/test/net/test_mobilenet_combine.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - - if (paddle_mobile.Load( - std::string(g_mobilenet_vision) + "/vision_mobilenet_model", - std::string(g_mobilenet_vision) + "/vision_mobilenet_params", true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - std::vector input; - std::vector dims{1, 3, 224, 224}; - - GetInput(g_test_image_1x3x224x224_vision_mobilenet_input, &input, - dims); - - auto vec_result = paddle_mobile.Predict(input, dims); - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - - auto time3 = time(); - for (int i = 0; i < 1; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - std::cout - << "如果结果Nan请查看: test/images/test_image_1x3x224x224_float 是否存在?" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_mobilenet_male2fe.cpp b/mobile/test/net/test_mobilenet_male2fe.cpp deleted file mode 100644 index eb83b5bafe..0000000000 --- a/mobile/test/net/test_mobilenet_male2fe.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../../src/common/types.h" -#include "../test_helper.h" -#include "../test_include.h" - -void feed(PaddleMobile *paddle_mobile, const DDim &dims, - std::string image_path, std::string feed_name) { - float *input_data_array = new float[product(dims)]; - std::ifstream in(image_path, std::ios::in); - for (int i = 0; i < product(dims); i++) { - float num; - in >> num; - input_data_array[i] = num; - } - in.close(); - framework::Tensor input_tensor(input_data_array, dims); - DLOG << feed_name << " : " << input_tensor; - paddle_mobile->Feed(feed_name, input_tensor); -} - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - auto time1 = paddle_mobile::time(); -#ifdef PADDLE_MOBILE_CL - paddle_mobile.SetCLPath("/data/local/tmp/bin"); -#endif - - if (paddle_mobile.Load(std::string("../models/nanbiannv") + "/model", - std::string("../models/nanbiannv") + "/params", - true)) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" - << std::endl; - - std::vector input; - feed(&paddle_mobile, {1, 3, 256, 256}, "../images/input_1_3_256_256", - "image"); - - auto time3 = paddle_mobile::time(); - paddle_mobile.Predict(); - auto time4 = paddle_mobile::time(); - - std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) - << "ms" << std::endl; - } - - auto rgb = paddle_mobile.Fetch("rgb"); - auto mask = paddle_mobile.Fetch("mask"); - LOG(kLOG_INFO) << "rgb" << *rgb; - LOG(kLOG_INFO) << "mask" << *mask; - return 0; -} diff --git a/mobile/test/net/test_multi_inference_predict.cpp 
b/mobile/test/net/test_multi_inference_predict.cpp deleted file mode 100644 index 8d97fee8c3..0000000000 --- a/mobile/test/net/test_multi_inference_predict.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include // NOLINT -#include "../test_helper.h" -#include "../test_include.h" - -void fun_yolo(); -int fun_mobilenet(); -int main() { - paddle_mobile::PaddleMobile paddle_mobile2; - - // fun_yolo(); - // fun_mobilenet(); - - std::thread t1(fun_yolo); - std::thread t2(fun_mobilenet); - - t1.join(); - t2.join(); - - return 0; -} - -void fun_yolo() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - // ../../../test/models/googlenet - // ../../../test/models/mobilenet - auto time1 = time(); - if (paddle_mobile.Load(g_yolo, true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - vector dims{1, 3, 227, 227}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), - static_cast(1)); - - vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "thread 1: predict cost :" << time_diff(time3, time4) / 10 - << "ms" << std::endl; - } -} - -int fun_mobilenet() { - paddle_mobile::PaddleMobile paddle_mobile; - 
paddle_mobile.SetThreadNum(4); - auto time1 = time(); - // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", - // std::string(g_mobilenet_detect) + "/params", true); - - auto isok = paddle_mobile.Load(g_mobilenet, true); - if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - vector input; - vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - auto vec_result = paddle_mobile.Predict(input, dims); - auto biggest = max_element(begin(vec_result), end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << distance(begin(vec_result), biggest) << std::endl; - - // 预热十次 - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } - DLOG << vec_result; - auto time4 = time(); - std::cout << "thread 2: predict cost :" << time_diff(time3, time4) / 10 - << "ms" << std::endl; - } - - std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " - "是否存在?" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_net.cpp b/mobile/test/net/test_net.cpp deleted file mode 100644 index 3d5386513b..0000000000 --- a/mobile/test/net/test_net.cpp +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -void test(int argc, char *argv[]); - -int main(int argc, char *argv[]) { - test(argc, argv); - return 0; -} - -void test(int argc, char *argv[]) { - int arg_index = 1; - bool fuse = std::stoi(argv[arg_index]) == 1; - arg_index++; - bool enable_memory_optimization = std::stoi(argv[arg_index]) == 1; - arg_index++; - bool quantification = std::stoi(argv[arg_index]) == 1; - arg_index++; - int quantification_fold = std::stoi(argv[arg_index]); - arg_index++; - paddle_mobile::PaddleMobileConfigInternal config; - config.memory_optimization_level = enable_memory_optimization - ? MemoryOptimizationWithoutFeeds - : NoMemoryOptimization; - - // save obfuscated model - // config.model_obfuscate_key = "asdf"; - // std::ofstream out_file("new-params", std::ofstream::binary); - // char *out_data = ReadFileToBuff("./checked_model/params"); - // int len = GetFileLength("./checked_model/params"); - // out_file.write(out_data, len); - // out_file.close(); - -#ifdef PADDLE_MOBILE_CL - // config.load_when_predict = true; - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetCLPath("/data/local/tmp/bin"); - std::cout << "testing opencl yyz " << std::endl; -#else - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetThreadNum(1); - std::cout << "testing cpu yyz " << std::endl; -#endif - - int dim_count = std::stoi(argv[arg_index]); - arg_index++; - int size = 1; - std::vector dims; - for (int i = 0; i < dim_count; i++) { - int64_t dim = std::stoi(argv[arg_index + i]); - size *= dim; - dims.push_back(dim); - } - arg_index += dim_count; - - bool is_lod = std::stoi(argv[arg_index]) == 1; - arg_index++; - paddle_mobile::framework::LoD lod{{}}; - if (is_lod) { - int lod_count = std::stoi(argv[arg_index]); - arg_index++; - for (int i = 0; i < lod_count; i++) { - int dim = std::stoi(argv[arg_index + i]); - lod[0].push_back(dim); - } - arg_index += lod_count; - } - 
- int var_count = std::stoi(argv[arg_index]); - arg_index++; - bool is_sample_step = std::stoi(argv[arg_index]) == 1; - arg_index++; - int sample_arg = std::stoi(argv[arg_index]); - int sample_step = sample_arg; - int sample_num = sample_arg; - arg_index++; - std::vector var_names; - for (int i = 0; i < var_count; i++) { - std::string var_name = argv[arg_index + i]; - var_names.push_back(var_name); - } - arg_index += var_count; - bool check_shape = std::stoi(argv[arg_index]) == 1; - arg_index++; - - auto time1 = time(); - if (paddle_mobile.Load("./checked_model/model", "./checked_model/params", - fuse, quantification, 1, is_lod, - quantification_fold)) { - auto time2 = time(); - std::cout << "auto-test" - << " load-time-cost :" << time_diff(time1, time2) << "ms" - << std::endl; - - float *input_data_array = new float[size]; - std::ifstream in("input.txt", std::ios::in); - for (int i = 0; i < size; i++) { - float num; - in >> num; - input_data_array[i] = num; - } - in.close(); - - auto time3 = time(); - // std::vector input_data; - // for (int i = 0; i < size; i++) { - // float num = input_data_array[i]; - // input_data.push_back(num); - // } - // paddle_mobile::framework::Tensor input_tensor(input_data, - // paddle_mobile::framework::make_ddim(dims)); - paddle_mobile::framework::Tensor input_tensor( - input_data_array, paddle_mobile::framework::make_ddim(dims)); - auto time4 = time(); - std::cout << "auto-test" - << " preprocess-time-cost :" << time_diff(time3, time4) << "ms" - << std::endl; - - paddle_mobile::framework::LoDTensor input_lod_tensor; - if (is_lod) { - input_lod_tensor.Resize(paddle_mobile::framework::make_ddim(dims)); - input_lod_tensor.set_lod(lod); - auto *tensor_data = input_lod_tensor.mutable_data(); - for (int i = 0; i < size; i++) { - tensor_data[i] = input_data_array[i]; - } - } - - // // 预热10次 - // for (int i = 0; i < 10; i++) { - // if (is_lod) { - // auto out = paddle_mobile.Predict(input_lod_tensor); - // } else { - // 
paddle_mobile.Feed(var_names[0], input_tensor); - // paddle_mobile.Predict(); - // } - // } - - // // 测速 - // auto time5 = time(); - // for (int i = 0; i < 50; i++) { - // if (is_lod) { - // auto out = paddle_mobile.Predict(input_lod_tensor); - // } else { - // paddle_mobile.Feed(var_names[0], input_tensor); - // paddle_mobile.Predict(); - // } - // } - // auto time6 = time(); - // std::cout << "auto-test" - // << " predict-time-cost " << time_diff(time5, time6) / 50 << - // "ms" - // << std::endl; - - // 测试正确性 - if (is_lod) { - auto out = paddle_mobile.Predict(input_lod_tensor); - } else { - paddle_mobile.Feed(var_names[0], input_tensor); - paddle_mobile.Predict(); - } -#ifdef PADDLE_MOBILE_CL - for (auto var_name : var_names) { - auto cl_image = paddle_mobile.FetchImage(var_name); - if (cl_image == nullptr || cl_image->GetCLImage() == nullptr) { - continue; - } - auto len = cl_image->numel(); - if (len == 0) { - continue; - } - size_t width = cl_image->ImageDims()[0]; - size_t height = cl_image->ImageDims()[1]; - paddle_mobile::framework::half_t *image_data = - new paddle_mobile::framework::half_t[height * width * 4]; - cl_int err; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - err = clEnqueueReadImage(cl_image->CommandQueue(), image, CL_TRUE, origin, - region, 0, 0, image_data, 0, NULL, NULL); - CL_CHECK_ERRORS(err); - float *tensor_data = new float[cl_image->numel()]; - auto converter = cl_image->Converter(); - converter->ImageToNCHW(image_data, tensor_data, cl_image->ImageDims(), - cl_image->dims()); - - auto data = tensor_data; - std::string sample = ""; - if (check_shape) { - for (int i = 0; i < cl_image->dims().size(); i++) { - sample += " " + std::to_string(cl_image->dims()[i]); - } - } - if (!is_sample_step) { - sample_step = len / sample_num; - } - if (sample_step <= 0) { - sample_step = 1; - } - for (int i = 0; i < len; i += sample_step) { - sample += " " + std::to_string(data[i]); - } 
- std::cout << "auto-test" - << " var " << var_name << sample << std::endl; - } -#else - for (auto var_name : var_names) { - auto out = paddle_mobile.Fetch(var_name); - auto len = out->numel(); - if (len == 0) { - continue; - } - if (out->memory_size() == 0) { - continue; - } - if (out->type() == type_id()) { - auto data = out->data(); - std::string sample = ""; - if (check_shape) { - for (int i = 0; i < out->dims().size(); i++) { - sample += " " + std::to_string(out->dims()[i]); - } - } - if (!is_sample_step) { - sample_step = len / sample_num; - } - if (sample_step <= 0) { - sample_step = 1; - } - for (int i = 0; i < len; i += sample_step) { - sample += " " + std::to_string(data[i]); - } - std::cout << "auto-test" - << " var " << var_name << sample << std::endl; - } else if (out->type() == type_id()) { - auto data = out->data(); - std::string sample = ""; - if (check_shape) { - for (int i = 0; i < out->dims().size(); i++) { - sample += " " + std::to_string(out->dims()[i]); - } - } - if (!is_sample_step) { - sample_step = len / sample_num; - } - if (sample_step <= 0) { - sample_step = 1; - } - for (int i = 0; i < len; i += sample_step) { - sample += " " + std::to_string(data[i]); - } - std::cout << "auto-test" - << " var " << var_name << sample << std::endl; - } - } -#endif - std::cout << std::endl; - } -} diff --git a/mobile/test/net/test_net_benchmark.cpp b/mobile/test/net/test_net_benchmark.cpp deleted file mode 100644 index 396f293f76..0000000000 --- a/mobile/test/net/test_net_benchmark.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { -#ifdef PADDLE_MOBILE_CL - paddle_mobile::PaddleMobileConfigInternal config; - config.load_when_predict = false; - paddle_mobile::PaddleMobile paddle_mobile(config); -#else - paddle_mobile::PaddleMobile paddle_mobile; -#endif - paddle_mobile.SetThreadNum(1); - auto time1 = paddle_mobile::time(); - - auto isok = paddle_mobile.Load(std::string(g_mobilenet_combined) + "/model", - std::string(g_mobilenet_combined) + "/params", - true, false, 1, false); - if (isok) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" - << std::endl; - - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - paddle_mobile::framework::DDim ddim = - paddle_mobile::framework::make_ddim(dims); - Tensor feed_tensor(input, paddle_mobile::framework::make_ddim(dims)); - - // 预热十次 - for (int i = 0; i < 10; ++i) { - // auto vec_result = paddle_mobile.Predict(input, dims); - paddle_mobile.Feed("data", feed_tensor); - paddle_mobile.Predict(); - } - auto time3 = paddle_mobile::time(); - for (int i = 0; i < 100; ++i) { - // auto vec_result = paddle_mobile.Predict(input, dims); - paddle_mobile.Feed("data", feed_tensor); - paddle_mobile.Predict(); - } - auto time4 = paddle_mobile::time(); - std::cout << "predict cost :" - << paddle_mobile::time_diff(time3, time4) / 100 << "ms" - << std::endl; - } - - return 0; -} diff --git a/mobile/test/net/test_net_multi_feed.cpp 
b/mobile/test/net/test_net_multi_feed.cpp deleted file mode 100644 index 5c04a76ad3..0000000000 --- a/mobile/test/net/test_net_multi_feed.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef PADDLE_MOBILE_CL - -#include -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -void test(int argc, char *argv[]); - -void feed(PaddleMobile *paddle_mobile, const DDim &dims, - std::string feed_name) { - float *input_data_array = new float[product(dims)]; - std::ifstream in(feed_name, std::ios::in); - for (int i = 0; i < product(dims); i++) { - float num; - in >> num; - input_data_array[i] = num; - } - in.close(); - framework::Tensor input_tensor(input_data_array, dims); - DLOG << feed_name << " : " << input_tensor; - paddle_mobile->Feed(feed_name, input_tensor); -} -int main(int argc, char *argv[]) { - test(argc, argv); - return 0; -} - -void test(int argc, char *argv[]) { - int arg_index = 1; - bool fuse = std::stoi(argv[arg_index]) == 1; - arg_index++; - bool enable_memory_optimization = std::stoi(argv[arg_index]) == 1; - arg_index++; - bool quantification = std::stoi(argv[arg_index]) == 1; - arg_index++; - int quantification_fold = std::stoi(argv[arg_index]); - arg_index++; - paddle_mobile::PaddleMobileConfigInternal config; - config.memory_optimization_level = enable_memory_optimization - ? 
MemoryOptimizationWithoutFeeds - : NoMemoryOptimization; - -#ifdef PADDLE_MOBILE_CL - // config.load_when_predict = true; - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetCLPath("/data/local/tmp/bin"); - std::cout << "testing opencl yyz " << std::endl; -#else - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetThreadNum(1); - std::cout << "testing cpu yyz " << std::endl; -#endif - - int dim_count = std::stoi(argv[arg_index]); - arg_index++; - int size = 1; - - arg_index += dim_count; - - bool is_lod = std::stoi(argv[arg_index]) == 1; - arg_index++; - paddle_mobile::framework::LoD lod{{}}; - if (is_lod) { - int lod_count = std::stoi(argv[arg_index]); - arg_index++; - for (int i = 0; i < lod_count; i++) { - int dim = std::stoi(argv[arg_index + i]); - lod[0].push_back(dim); - } - arg_index += lod_count; - } - - int var_count = std::stoi(argv[arg_index]); - arg_index++; - bool is_sample_step = std::stoi(argv[arg_index]) == 1; - arg_index++; - int sample_arg = std::stoi(argv[arg_index]); - int sample_step = sample_arg; - int sample_num = sample_arg; - arg_index++; - std::vector var_names; - for (int i = 0; i < var_count; i++) { - std::string var_name = argv[arg_index + i]; - var_names.push_back(var_name); - } - arg_index += var_count; - bool check_shape = std::stoi(argv[arg_index]) == 1; - arg_index++; - - auto time1 = time(); - if (paddle_mobile.Load("./checked_model/model", "./checked_model/params", - fuse, quantification, 1, is_lod, - quantification_fold)) { - auto time2 = time(); - std::cout << "auto-test" - << " load-time-cost :" << time_diff(time1, time2) << "ms" - << std::endl; - - feed(&paddle_mobile, {1, 4, 256, 288}, "input_rgb"); - feed(&paddle_mobile, {1, 128, 64, 72}, "last_input"); - feed(&paddle_mobile, {1, 64, 72, 2}, "grid"); - feed(&paddle_mobile, {1, 1, 64, 72}, "reliable"); - paddle_mobile.Predict(); - -#ifdef PADDLE_MOBILE_CL - for (auto var_name : var_names) { - auto cl_image = 
paddle_mobile.FetchImage(var_name); - if (cl_image == nullptr || cl_image->GetCLImage() == nullptr) { - continue; - } - auto len = cl_image->numel(); - if (len == 0) { - continue; - } - size_t width = cl_image->ImageDims()[0]; - size_t height = cl_image->ImageDims()[1]; - paddle_mobile::framework::half_t *image_data = - new paddle_mobile::framework::half_t[height * width * 4]; - cl_int err; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - err = clEnqueueReadImage(cl_image->CommandQueue(), image, CL_TRUE, origin, - region, 0, 0, image_data, 0, NULL, NULL); - CL_CHECK_ERRORS(err); - float *tensor_data = new float[cl_image->numel()]; - auto converter = cl_image->Converter(); - converter->ImageToNCHW(image_data, tensor_data, cl_image->ImageDims(), - cl_image->dims()); - - auto data = tensor_data; - std::string sample = ""; - if (check_shape) { - for (int i = 0; i < cl_image->dims().size(); i++) { - sample += " " + std::to_string(cl_image->dims()[i]); - } - } - if (!is_sample_step) { - sample_step = len / sample_num; - } - if (sample_step <= 0) { - sample_step = 1; - } - for (int i = 0; i < len; i += sample_step) { - sample += " " + std::to_string(data[i]); - } - std::cout << "auto-test" - << " var " << var_name << sample << std::endl; - } -#else - for (auto var_name : var_names) { - auto out = paddle_mobile.Fetch(var_name); - auto len = out->numel(); - if (len == 0) { - continue; - } - if (out->memory_size() == 0) { - continue; - } - if (out->type() == type_id()) { - auto data = out->data(); - std::string sample = ""; - if (check_shape) { - for (int i = 0; i < out->dims().size(); i++) { - sample += " " + std::to_string(out->dims()[i]); - } - } - if (!is_sample_step) { - sample_step = len / sample_num; - } - if (sample_step <= 0) { - sample_step = 1; - } - for (int i = 0; i < len; i += sample_step) { - sample += " " + std::to_string(data[i]); - } - std::cout << "auto-test" - << " var " << var_name << 
sample << std::endl; - } else if (out->type() == type_id()) { - auto data = out->data(); - std::string sample = ""; - if (check_shape) { - for (int i = 0; i < out->dims().size(); i++) { - sample += " " + std::to_string(out->dims()[i]); - } - } - if (!is_sample_step) { - sample_step = len / sample_num; - } - if (sample_step <= 0) { - sample_step = 1; - } - for (int i = 0; i < len; i += sample_step) { - sample += " " + std::to_string(data[i]); - } - std::cout << "auto-test" - << " var " << var_name << sample << std::endl; - } - } -#endif - std::cout << std::endl; - } -} -#else -int main() {} -#endif diff --git a/mobile/test/net/test_net_performance.cpp b/mobile/test/net/test_net_performance.cpp deleted file mode 100644 index ac4c71588b..0000000000 --- a/mobile/test/net/test_net_performance.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include "../test_helper.h" -#include "../test_include.h" -void test(int argc, char *argv[]); - -int main(int argc, char *argv[]) { - test(argc, argv); - return 0; -} - -void test(int argc, char *argv[]) { - int arg_index = 1; - bool fuse = std::stoi(argv[arg_index]) == 1; - arg_index++; - bool enable_memory_optimization = std::stoi(argv[arg_index]) == 1; - arg_index++; - bool quantification = std::stoi(argv[arg_index]) == 1; - arg_index++; - int quantification_fold = std::stoi(argv[arg_index]); - arg_index++; - paddle_mobile::PaddleMobileConfigInternal config; - config.memory_optimization_level = enable_memory_optimization - ? MemoryOptimizationWithoutFeeds - : NoMemoryOptimization; - - // save obfuscated model - // config.model_obfuscate_key = "asdf"; - // std::ofstream out_file("new-params", std::ofstream::binary); - // char *out_data = ReadFileToBuff("./checked_model/params"); - // int len = GetFileLength("./checked_model/params"); - // out_file.write(out_data, len); - // out_file.close(); - -#ifdef PADDLE_MOBILE_CL - // config.load_when_predict = true; - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetCLPath("/data/local/tmp/bin"); - std::cout << "testing opencl performance " << std::endl; -#else - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetThreadNum(1); - std::cout << "testing cpu performance " << std::endl; -#endif - - int dim_count = std::stoi(argv[arg_index]); - arg_index++; - int size = 1; - std::vector dims; - for (int i = 0; i < dim_count; i++) { - int64_t dim = std::stoi(argv[arg_index + i]); - size *= dim; - dims.push_back(dim); - } - arg_index += dim_count; - - bool is_lod = std::stoi(argv[arg_index]) == 1; - arg_index++; - paddle_mobile::framework::LoD lod{{}}; - if (is_lod) { - int lod_count = std::stoi(argv[arg_index]); - arg_index++; - for (int i = 0; i < lod_count; i++) { - int dim = std::stoi(argv[arg_index + i]); - lod[0].push_back(dim); - } - 
arg_index += lod_count; - } - - int var_count = std::stoi(argv[arg_index]); - arg_index++; - bool is_sample_step = std::stoi(argv[arg_index]) == 1; - arg_index++; - int sample_arg = std::stoi(argv[arg_index]); - int sample_step = sample_arg; - int sample_num = sample_arg; - arg_index++; - std::vector var_names; - for (int i = 0; i < var_count; i++) { - std::string var_name = argv[arg_index + i]; - var_names.push_back(var_name); - } - arg_index += var_count; - bool check_shape = std::stoi(argv[arg_index]) == 1; - arg_index++; - - int run_times = std::stoi(argv[arg_index]); - arg_index++; - - bool warm_up = std::stoi(argv[arg_index]) == 1; - arg_index++; - - auto time1 = time(); - if (paddle_mobile.Load("./checked_model/model", "./checked_model/params", - fuse, quantification, 1, is_lod, - quantification_fold)) { - auto time2 = time(); - std::cout << "auto-test" - << " load-time-cost :" << time_diff(time1, time2) << "ms" - << std::endl; - - float *input_data_array = new float[size]; - std::ifstream in("input.txt", std::ios::in); - for (int i = 0; i < size; i++) { - float num; - in >> num; - input_data_array[i] = num; - } - in.close(); - - auto time3 = time(); - - paddle_mobile::framework::Tensor input_tensor( - input_data_array, paddle_mobile::framework::make_ddim(dims)); - auto time4 = time(); - std::cout << "auto-test" - << " preprocess-time-cost :" << time_diff(time3, time4) << "ms" - << std::endl; - - paddle_mobile::framework::LoDTensor input_lod_tensor; - if (is_lod) { - input_lod_tensor.Resize(paddle_mobile::framework::make_ddim(dims)); - input_lod_tensor.set_lod(lod); - auto *tensor_data = input_lod_tensor.mutable_data(); - for (int i = 0; i < size; i++) { - tensor_data[i] = input_data_array[i]; - } - } - - // 预热10次 - if (warm_up) { - for (int i = 0; i < 10; i++) { - if (is_lod) { - auto out = paddle_mobile.Predict(input_lod_tensor); - } else { - paddle_mobile.Feed(var_names[0], input_tensor); - paddle_mobile.Predict(); - } - } - } - - // 测速 - auto max_time = 
-1; - auto min_time = 100000; - auto all_time = 0; - if (is_lod) { - for (int i = 0; i < run_times; i++) { - auto time7 = time(); - paddle_mobile.Predict(input_lod_tensor); - auto time8 = time(); - const double diff_time_single = time_diff(time7, time8); - max_time = fmax(diff_time_single, max_time); - min_time = fmin(diff_time_single, min_time); - all_time += diff_time_single; - } - } else { - paddle_mobile.Feed(var_names[0], input_tensor); - for (int i = 0; i < run_times; i++) { - auto time7 = time(); - paddle_mobile.Predict(); - auto time8 = time(); - usleep(1000 * quantification_fold); - const double diff_time_single = time_diff(time7, time8); - max_time = fmax(diff_time_single, max_time); - min_time = fmin(diff_time_single, min_time); - all_time += diff_time_single; - } - } - - std::cout << "auto-test" - << " predict-time-cost-avg " << all_time * 1.0f / run_times - << "ms" << std::endl; - std::cout << "auto-test" - << " predict-time-cost-max " << double(max_time) << "ms" - << std::endl; - std::cout << "auto-test" - << " predict-time-cost-min " << double(min_time) << "ms" - << std::endl; - - std::cout << std::endl; - } -} diff --git a/mobile/test/net/test_nlp.cpp b/mobile/test/net/test_nlp.cpp deleted file mode 100644 index db13e2da57..0000000000 --- a/mobile/test/net/test_nlp.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", - // std::string(g_mobilenet_detect) + "/params", true); - - auto isok = paddle_mobile.Load(g_nlp, true, false, 1, true); - - // auto isok = paddle_mobile.Load(std::string(g_nlp) + "/model", - // std::string(g_nlp) + "/params", false); - if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - // 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479 - - std::vector ids{1918, 117, 55, 97, 1352, 4272, 1656, 903}; - - paddle_mobile::framework::LoDTensor words; - auto size = static_cast(ids.size()); - paddle_mobile::framework::LoD lod{{0, ids.size()}}; - DDim dims{size, 1}; - words.Resize(dims); - words.set_lod(lod); - DLOG << "words lod : " << words.lod(); - auto *pdata = words.mutable_data(); - size_t n = words.numel() * sizeof(int64_t); - DLOG << "n :" << n; - memcpy(pdata, ids.data(), n); - DLOG << "words lod 22: " << words.lod(); - auto time3 = time(); - for (int i = 0; i < 1; ++i) { - paddle_mobile.Predict(words); - DLOG << *paddle_mobile.Fetch(); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" - << std::endl; - } - - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - // 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479 - - std::vector ids{ - 2084, 635, 1035, 197, 990, 150, 1132, 2403, 546, 770, 4060, 3352, - 1798, 1589, 1352, 98, 136, 3461, 3186, 1159, 515, 764, 278, 1178, - 5044, 4060, 943, 932, 463, 1198, 3352, 374, 1198, 3352, 374, 2047, - 1069, 1589, 3672, 1178, 1178, 2165, 1178, 2084, 635, 3087, 2236, 546, - 2047, 1549, 546, 2047, 302, 2202, 398, 804, 397, 657, 804, 866, - 932, 2084, 515, 2165, 397, 302, 2202, 526, 992, 906, 1215, 1589, 
- 4493, 2403, 723, 932, 2084, 635, 1352, 932, 444, 2047, 1159, 1893, - 1579, 59, 330, 98, 1296, 1159, 3430, 738, 3186, 1071, 2174, 3933}; - - paddle_mobile::framework::LoDTensor words; - auto size = static_cast(ids.size()); - paddle_mobile::framework::LoD lod{{0, ids.size()}}; - DDim dims{size, 1}; - words.Resize(dims); - words.set_lod(lod); - DLOG << "words lod : " << words.lod(); - auto *pdata = words.mutable_data(); - size_t n = words.numel() * sizeof(int64_t); - DLOG << "n :" << n; - memcpy(pdata, ids.data(), n); - DLOG << "words lod 22: " << words.lod(); - auto time3 = time(); - for (int i = 0; i < 1; ++i) { - paddle_mobile.Predict(words); - DLOG << *paddle_mobile.Fetch(); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" - << std::endl; - return 0; -} diff --git a/mobile/test/net/test_ocr.cpp b/mobile/test/net/test_ocr.cpp deleted file mode 100644 index d7dde5406e..0000000000 --- a/mobile/test/net/test_ocr.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -const int max_run_times = 10; - -int main(int argc, char **argv) { - if (argc < 3) { - std::cerr - << "Usage: ./test_ocr [detect_model_dir|recog_model_dir] image_path" - << std::endl; - return 1; - } - std::string model_dir = argv[1]; - std::string image_path = argv[2]; - - // init input, output params - std::vector input_vec; - std::vector input_shape; - std::vector output_fetch_nodes; - int PRINT_NODE_ELEM_NUM = 10; - - bool is_det_model = model_dir.find("detect") != string::npos; - if (is_det_model) { - input_shape.emplace_back(1); - input_shape.emplace_back(3); - input_shape.emplace_back(512); - input_shape.emplace_back(512); - output_fetch_nodes.emplace_back("sigmoid_0.tmp_0"); - output_fetch_nodes.emplace_back("tmp_5"); - } else { - input_shape.emplace_back(1); - input_shape.emplace_back(3); - input_shape.emplace_back(48); - input_shape.emplace_back(512); - output_fetch_nodes.emplace_back("top_k_1.tmp_0"); - output_fetch_nodes.emplace_back("cast_330.tmp_0"); - } - std::shared_ptr outputs[output_fetch_nodes.size()]; - - // init paddle instance - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(1); - std::cout << "start load " << std::endl; - auto load_success = paddle_mobile.Load(std::string(model_dir) + "/model", - std::string(model_dir) + "/params", - true, false, 1, true); - std::cout << "load_success:" << load_success << std::endl; - // input image raw tensor, generated by - // [scripts](tools/python/imagetools/img2nchw.py) - std::cout << "image_path: " << image_path << std::endl; - std::cout << "input_shape: " << input_shape[0] << ", " << input_shape[1] - << ", " << input_shape[2] << ", " << input_shape[3] << std::endl; - GetInput(image_path, &input_vec, input_shape); - - // model predict - auto pred_start_time = paddle_mobile::time(); - for (int run_idx = 0; run_idx < max_run_times; ++run_idx) { - paddle_mobile.Predict(input_vec, input_shape); - 
for (int out_idx = 0; out_idx < output_fetch_nodes.size(); ++out_idx) { - auto fetch_name = output_fetch_nodes[out_idx]; - outputs[out_idx] = paddle_mobile.Fetch(fetch_name); - } - } - auto pred_end_time = paddle_mobile::time(); - - // inference time - double pred_time = - paddle_mobile::time_diff(pred_start_time, pred_end_time) / max_run_times; - std::cout << "predict time(ms): " << pred_time << std::endl; - - // output result - for (int out_idx = 0; out_idx < output_fetch_nodes.size(); ++out_idx) { - std::string node_id = output_fetch_nodes[out_idx]; - auto node_lod_tensor = outputs[out_idx]; - int node_elem_num = node_lod_tensor->numel(); - float *node_ptr = node_lod_tensor->data(); - std::cout << "==== output_fetch_nodes[" << out_idx - << "] =====" << std::endl; - std::cout << "node_id: " << node_id << std::endl; - std::cout << "node_elem_num: " << node_elem_num << std::endl; - std::cout << "PRINT_NODE_ELEM_NUM: " << PRINT_NODE_ELEM_NUM << std::endl; - PRINT_NODE_ELEM_NUM = - (node_elem_num > PRINT_NODE_ELEM_NUM) ? PRINT_NODE_ELEM_NUM : 0; - for (int eidx = 0; eidx < PRINT_NODE_ELEM_NUM; ++eidx) { - std::cout << node_id << "[" << eidx << "]: " << node_ptr[eidx] - << std::endl; - } - std::cout << std::endl; - } - - return 0; -} diff --git a/mobile/test/net/test_op_in_net.cpp b/mobile/test/net/test_op_in_net.cpp deleted file mode 100644 index 9425c02762..0000000000 --- a/mobile/test/net/test_op_in_net.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "../test_helper.h" -#include "../test_include.h" - -void test(int argc, char *argv[]); - -int main(int argc, char *argv[]) { - test(argc, argv); - return 0; -} - -void test(int argc, char *argv[]) { - std::vector dims{1, 8, 32, 32}; - int op_index = 2; - std::string input_var_name = "ConvNdBackward2.conv2d.output.1.tmp_0"; - std::vector output_var_names{ - "ConvNdBackward2.conv2d.output.1.tmp_1"}; - - bool fuse = false; - bool enable_memory_optimization = true; - paddle_mobile::PaddleMobileConfigInternal config; - config.memory_optimization_level = enable_memory_optimization - ? MemoryOptimizationWithoutFeeds - : NoMemoryOptimization; -#ifdef PADDLE_MOBILE_CL - // config.load_when_predict = true; - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetCLPath("/data/local/tmp/bin"); -#else - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetThreadNum(1); -#endif - - int size = 1; - for (int i = 0; i < dims.size(); i++) { - size *= dims[i]; - } - - bool is_sample_step = false; - int sample_step = 1; - int sample_num = 20; - - auto time1 = time(); - if (paddle_mobile.Load("./checked_model/model", "./checked_model/params", - fuse, false, 1, true, 1)) { - auto time2 = time(); - std::cout << "auto-test" - << " load-time-cost :" << time_diff(time1, time2) << "ms" - << std::endl; - - float input_data_array[size]; - std::ifstream in("input.txt", std::ios::in); - for (int i = 0; i < size; i++) { - float num; - in >> num; - input_data_array[i] = num; - } - in.close(); - - auto time3 = time(); - std::vector input_data; - for (int i = 0; i < size; i++) { - float num = input_data_array[i]; - input_data.push_back(num); - } - paddle_mobile::framework::Tensor input_tensor( - input_data, paddle_mobile::framework::make_ddim(dims)); - auto time4 = time(); - std::cout << "auto-test" - << " 
preprocess-time-cost :" << time_diff(time3, time4) << "ms" - << std::endl; - - // 测试正确性 - // 以下代码依赖paddle_mobile.h及executor.h的属性可见性,如需使用,调整可见性后,放开注释 - // auto *input_var = - // paddle_mobile.executor_->program_.scope->FindVar(input_var_name); - // framework::LoDTensor *target = - // input_var->template GetMutable(); - // target->Resize(input_tensor.dims()); - // target->ShareDataWith(input_tensor); - // paddle_mobile.executor_->ops_of_block0_[op_index]->InferShape(); - // paddle_mobile.executor_->ops_of_block0_[op_index]->Run(); - - for (auto var_name : output_var_names) { - auto out = paddle_mobile.Fetch(var_name); - auto len = out->numel(); - if (len == 0) { - continue; - } - if (out->memory_size() == 0) { - continue; - } - auto data = out->data(); - std::string sample = ""; - if (!is_sample_step) { - sample_step = len / sample_num; - } - if (sample_step <= 0) { - sample_step = 1; - } - for (int i = 0; i < len; i += sample_step) { - sample += " " + std::to_string(data[i]); - } - std::cout << "auto-test" - << " var " << var_name << sample << std::endl; - } - std::cout << std::endl; - } -} diff --git a/mobile/test/net/test_resnet.cpp b/mobile/test/net/test_resnet.cpp deleted file mode 100644 index 9c60bd13cf..0000000000 --- a/mobile/test/net/test_resnet.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { -#ifdef PADDLE_MOBILE_FPGA - paddle_mobile::PaddleMobile paddle_mobile; -#endif - -#ifdef PADDLE_MOBILE_CL - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetCLPath("/data/local/tmp/bin"); -#else - paddle_mobile::PaddleMobile paddle_mobile; -#endif - paddle_mobile.SetThreadNum(4); - auto time1 = time(); - if (paddle_mobile.Load(g_resnet, true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - std::vector dims{1, 3, 32, 32}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 32, 32}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); -#ifndef PADDLE_MOBILE_FPGA - // 预热十次 - // for (int i = 0; i < 10; ++i) { - // paddle_mobile.Predict(input, dims); - // } - auto time3 = time(); - // for (int i = 0; i < 10; ++i) { - paddle_mobile.Predict(input, dims); - // } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; - -#else - auto time3 = time(); - paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(-1); - /*paddle_mobile.Predict_From(10); - auto tensor_ptr = paddle_mobile.FetchResult(9); - std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel() - << std::endl; - auto result_ptr = paddle_mobile.FetchResult(); - std::cout << "Result tensor element number: " << result_ptr->numel() - << std::endl; - - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl;*/ -#endif - } - return 0; -} diff --git a/mobile/test/net/test_squeezenet.cpp b/mobile/test/net/test_squeezenet.cpp deleted file mode 100644 index 02ec8691fe..0000000000 --- a/mobile/test/net/test_squeezenet.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - // ../../../test/models/googlenet - // ../../../test/models/mobilenet - auto time1 = time(); - if (paddle_mobile.Load(g_squeezenet, true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - std::vector dims{1, 3, 227, 227}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - // 预热十次 - for (int i = 0; i < 10; ++i) { - paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - - return 0; -} diff --git a/mobile/test/net/test_super.cpp b/mobile/test/net/test_super.cpp deleted file mode 100644 index 669859f622..0000000000 --- a/mobile/test/net/test_super.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../../src/common/types.h" -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobileConfigInternal config; - config.load_when_predict = true; - -#ifdef PADDLE_MOBILE_CL - paddle_mobile::PaddleMobile paddle_mobile(config); - paddle_mobile.SetCLPath("/data/local/tmp/bin"); -#else - paddle_mobile::PaddleMobile paddle_mobile; -#endif - // paddle_mobile.SetThreadNum(4); - - int max = 10; - auto time1 = paddle_mobile::time(); - auto isok = paddle_mobile.Load(std::string(g_super) + "/model", - std::string(g_super) + "/params", true, false, - 1, false); - - if (isok) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" - << std::endl; - - // 300 * 300 - std::vector input; - std::vector dims{1, 1, 300, 300}; - GetInput(g_test_image_1x3x224x224, &input, dims); - paddle_mobile.Predict(input, dims); - - // 640 * 360 (360P) - std::vector input1; - std::vector dims1{1, 1, 640, 360}; - GetInput(g_test_image_1x3x224x224, &input1, dims1); - auto time3 = paddle_mobile::time(); - for (int i = 0; i < max; ++i) { - auto time1 = paddle_mobile::time(); - paddle_mobile.Predict(input1, dims1); - auto time2 = paddle_mobile::time(); - std::cout << "640 * 360 predict cost :第" << i << ": " - << paddle_mobile::time_diff(time1, time2) << "ms" << std::endl; - } - auto time4 = paddle_mobile::time(); - std::cout << "640 * 360 predict cost :" - << paddle_mobile::time_diff(time3, time4) / max << "ms" - << std::endl; - - // 720 * 480 (480P) - std::vector input2; - 
std::vector dims2{1, 1, 720, 480}; - GetInput(g_test_image_1x3x224x224, &input2, dims2); - auto time5 = paddle_mobile::time(); - for (int i = 0; i < max; ++i) { - auto time1 = paddle_mobile::time(); - paddle_mobile.Predict(input2, dims2); - auto time2 = paddle_mobile::time(); - std::cout << "720 * 480 predict cost :第" << i << ": " - << paddle_mobile::time_diff(time1, time2) << "ms" << std::endl; - } - auto time6 = paddle_mobile::time(); - std::cout << "720 * 480 predict cost :" - << paddle_mobile::time_diff(time5, time6) / max << "ms" - << std::endl; - - // 1024 * 576 (576P) - std::vector input3; - std::vector dims3{1, 1, 1024, 576}; - GetInput(g_test_image_1x3x224x224, &input3, dims3); - auto time7 = paddle_mobile::time(); - for (int i = 0; i < max; ++i) { - auto time1 = paddle_mobile::time(); - paddle_mobile.Predict(input3, dims3); - auto time2 = paddle_mobile::time(); - std::cout << "1024 * 576 predict cost :第" << i << ": " - << paddle_mobile::time_diff(time1, time2) << "ms" << std::endl; - } - auto time8 = paddle_mobile::time(); - std::cout << "1024 * 576 predict cost :" - << paddle_mobile::time_diff(time7, time8) / max << "ms" - << std::endl; - - // 1280 * 720 - std::vector input4; - std::vector dims4{1, 1, 1280, 720}; - GetInput(g_test_image_1x3x224x224, &input4, dims4); - auto time9 = paddle_mobile::time(); - for (int i = 0; i < max; ++i) { - auto time1 = paddle_mobile::time(); - paddle_mobile.Predict(input4, dims4); - auto time2 = paddle_mobile::time(); - std::cout << "1280 * 720 predict cost :第" << i << ": " - << paddle_mobile::time_diff(time1, time2) << "ms" << std::endl; - } - auto time10 = paddle_mobile::time(); - std::cout << "1280 * 720 predict cost :" - << paddle_mobile::time_diff(time9, time10) / max << "ms" - << std::endl; - } - - return 0; -} diff --git a/mobile/test/net/test_vgg16ssd.cpp b/mobile/test/net/test_vgg16ssd.cpp deleted file mode 100644 index 387d6f38ea..0000000000 --- a/mobile/test/net/test_vgg16ssd.cpp +++ /dev/null @@ -1,46 +0,0 @@ 
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(1); - auto time1 = paddle_mobile::time(); - - auto isok = - paddle_mobile.Load(std::string(g_vgg16_ssd_combined) + "/model", - std::string(g_vgg16_ssd_combined) + "/params", false); - if (isok) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" - << std::endl; - - std::vector dims{1, 3, 300, 300}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 300, 300}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - - auto vec_result = paddle_mobile.Predict(input, dims); - - DLOG << vec_result; - } - - return 0; -} diff --git a/mobile/test/net/test_wrap.cpp b/mobile/test/net/test_wrap.cpp deleted file mode 100644 index 69f3e785e8..0000000000 --- a/mobile/test/net/test_wrap.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include "io/paddle_mobile_wrap.h" - -int main(int argc, char *argv[]) { -#ifndef PADDLE_MOBILE_FPGA - paddle_mobile::wrap::Net *net = - new paddle_mobile::wrap::Net(paddle_mobile::wrap::kGPU_CL); - net->SetCLPath("/data/local/tmp/bin"); - net->Load("./checked_model/model", "./checked_model/params", false, false, 1, - true); - int size = 1 * 3 * 416 * 416; - std::vector shape{1, 3, 416, 416}; - float *data = new float[size]; - for (int i = 0; i < size; i++) { - data[i] = 0.0; - } - std::ifstream infile; - infile.open("input.txt"); - for (int i = 0; i < size; i++) { - infile >> data[i]; - } - infile.close(); - // input as vector - // std::vector data_as_vector(data, data + size); - // auto output = net->Predict(data_as_vector, shape); - // for (auto item : output) { - // std::cout << item << std::endl; - // } - // input as float pointer - paddle_mobile::wrap::Tensor input(data, - paddle_mobile::wrap::make_ddim(shape)); - net->Feed("image", input); - net->Predict(); - auto output = net->Fetch("save_infer_model/scale_0"); - int output_size = 1; - std::cout << "output shape: "; - for (int i = 0; i < output->dims().size(); i++) { - std::cout << output->dims()[i] << " "; - output_size *= output->dims()[i]; - } - std::cout << std::endl; - std::cout << "output data: "; - for (int i = 0; i < output_size; i++) { - std::cout << output->data()[i] << std::endl; - } -#endif - return 0; -} diff --git a/mobile/test/net/test_yolo.cpp b/mobile/test/net/test_yolo.cpp deleted file mode 100644 index 40aabe92f1..0000000000 --- 
a/mobile/test/net/test_yolo.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - // ../../../test/models/googlenet - // ../../../test/models/mobilenet - auto time1 = time(); - if (paddle_mobile.Load(std::string(g_yolo) + "/model", - std::string(g_yolo) + "/params", true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; - - std::vector dims{1, 3, 227, 227}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - // 预热十次 - for (int i = 0; i < 10; ++i) { - paddle_mobile.Predict(input, dims); - } - auto time3 = time(); - for (int i = 0; i < 10; ++i) { - paddle_mobile.Predict(input, dims); - } - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - return 0; -} diff --git a/mobile/test/net/test_yolo_combined.cpp b/mobile/test/net/test_yolo_combined.cpp deleted file mode 100644 index 5a589878cc..0000000000 --- a/mobile/test/net/test_yolo_combined.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - // ../../../test/models/googlenet - // ../../../test/models/mobilenet - auto time1 = time(); - - if (paddle_mobile.Load(std::string(g_yolo_vision) + "/model", - std::string(g_yolo_vision) + "/params", true)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - std::vector dims{1, 3, 416, 416}; - std::vector input; - - GetInput(g_test_image_1x3x416x416_vision_yolo_input, &input, dims); - std::cout << "input.size(): " << input.size() << std::endl; - for (int j = 0; j < 100; ++j) { - std::cout << j << " : " << input[j] << std::endl; - } - // // 预热十次 - // for (int i = 0; i < 10; ++i) { - // paddle_mobile.Predict(input, dims); - // } - auto time3 = time(); - const vector vector_out = paddle_mobile.Predict(input, dims); - - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; - } - return 0; -} diff --git a/mobile/test/net/test_yologpu.cpp b/mobile/test/net/test_yologpu.cpp deleted file mode 100644 index 37f4a78019..0000000000 --- a/mobile/test/net/test_yologpu.cpp +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include // NOLINT -#include "../../src/common/types.h" -#include "../../src/io/paddle_test_inference_api.h" -#include "../test_helper.h" -#include "../test_include.h" -void t1() { - paddle_mobile::PaddleMobile paddle_mobile_gpu; - paddle_mobile::PaddleMobile paddle_mobile_cpu; - paddle_mobile::PaddleTester paddle_test_cpu; - paddle_mobile::PaddleTester paddle_test_gpu; - printf("cpu time:%f\n", paddle_test_cpu.CaculatePredictTime()); - std::string path = "/data/local/tmp/bin"; - printf("gpu time:%f\n", paddle_test_gpu.CaculatePredictTime(&path)); - // paddle_mobile.SetThreadNum(4); -#ifdef PADDLE_MOBILE_CL - paddle_mobile_gpu.SetCLPath("/data/local/tmp/bin"); -#endif - auto time1 = paddle_mobile::time(); - auto isok = - paddle_mobile_gpu.Load(std::string(g_yolo_vision) + "/model", - std::string(g_yolo_vision) + "/params", true); - - // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); - if (isok) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" - << std::endl; - - std::vector input; - std::vector dims{1, 3, 416, 416}; - GetInput(g_test_image_1x3x416x416_vision_yolo_input, &input, dims); - - std::vector vec_result; - // = paddle_mobile.Predict(input, dims); - - auto time3 = paddle_mobile::time(); - int max = 1; - for (int i = 0; i < max; ++i) { - vec_result = paddle_mobile_gpu.Predict(input, dims); - } - auto time4 = paddle_mobile::time(); - - 
// auto time3 = paddle_mobile::time(); - - // for (int i = 0; i < 10; ++i) { - // auto vec_result = paddle_mobile.Predict(input, dims); - // } - - // auto time4 = paddle_mobile::time(); - - std::cout << "predict cost :" - << paddle_mobile::time_diff(time3, time4) / max << "ms" - << std::endl; - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - // for (float i : vec_result) { - // std::cout << i << std::endl; - // } - } -} - -void t2() { - paddle_mobile::PaddleMobile paddle_mobile; - // paddle_mobile.SetThreadNum(4); -#ifdef PADDLE_MOBILE_CL - paddle_mobile.SetCLPath("/data/local/tmp/bin"); -#endif - auto time1 = paddle_mobile::time(); - auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model", - std::string(g_yolo_mul) + "/params", true); - - // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); - if (isok) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" - << std::endl; - - std::vector input; - std::vector dims{1, 3, 416, 416}; - GetInput(g_yolo_img, &input, dims); - - std::vector vec_result; - // = paddle_mobile.Predict(input, dims); - - auto time3 = paddle_mobile::time(); - int max = 10; - for (int i = 0; i < max; ++i) { - vec_result = paddle_mobile.Predict(input, dims); - } - auto time4 = paddle_mobile::time(); - - // auto time3 = paddle_mobile::time(); - - // for (int i = 0; i < 10; ++i) { - // auto vec_result = paddle_mobile.Predict(input, dims); - // } - - // auto time4 = paddle_mobile::time(); - - std::cout << "predict cost :" - << paddle_mobile::time_diff(time3, time4) / max << "ms" - << std::endl; - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << 
std::distance(std::begin(vec_result), biggest) << std::endl; - // for (float i : vec_result) { - // std::cout << i << std::endl; - // } - } -} - -void t3() { - paddle_mobile::PaddleMobile paddle_mobile; - // paddle_mobile.SetThreadNum(4); - // #ifdef PADDLE_MOBILE_CL - // paddle_mobile.SetCLPath("/data/local/tmp/bin"); - // #endif - auto time1 = paddle_mobile::time(); - auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model", - std::string(g_yolo_mul) + "/params", true); - - // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); - if (isok) { - auto time2 = paddle_mobile::time(); - std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" - << std::endl; - - std::vector input; - std::vector dims{1, 3, 416, 416}; - GetInput(g_yolo_img, &input, dims); - - std::vector vec_result = paddle_mobile.Predict(input, dims); - - auto time3 = paddle_mobile::time(); - int max = 10; - for (int i = 0; i < max; ++i) { - vec_result = paddle_mobile.Predict(input, dims); - } - auto time4 = paddle_mobile::time(); - - // auto time3 = paddle_mobile::time(); - - // for (int i = 0; i < 10; ++i) { - // auto vec_result = paddle_mobile.Predict(input, dims); - // } - - // auto time4 = paddle_mobile::time(); - - std::cout << "predict cost :" - << paddle_mobile::time_diff(time3, time4) / max << "ms" - << std::endl; - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - // for (float i : vec_result) { - // std::cout << i << std::endl; - // } - } -} - -int main() { - // std::thread th1(t1); - // std::thread th2(t2); - // std::thread th3(t3); - std::thread th1(t1); - // th1.join(); - // th2.join(); - // th3.join(); - th1.join(); - return 0; -} diff --git a/mobile/test/operators/test_batchnorm_op.cpp b/mobile/test/operators/test_batchnorm_op.cpp deleted file mode 100644 index 
92cb7157c1..0000000000 --- a/mobile/test/operators/test_batchnorm_op.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/batchnorm_op.h" - -namespace paddle_mobile { - -void BatchNorm(const framework::Tensor *X, const framework::Tensor *Mean, - const framework::Tensor *Var, const framework::Tensor *Scale, - const framework::Tensor *Bias, const float eps, - framework::Tensor *Y) { - const float *x = X->data(); - const float *m = Mean->data(); - const float *v = Var->data(); - const float *s = Scale->data(); - const float *b = Bias->data(); - float *y = Y->mutable_data(); - - int batch_size = X->dims()[0]; - int channel = X->dims()[1]; - int hw = X->dims()[2] * X->dims()[3]; - - for (int batch = 0; batch < batch_size; ++batch) { - for (int c = 0; c < channel; ++c) { - float mean = m[c]; - float inv_var = 1.f / std::sqrt(v[c] + eps); - float scale = s[c]; - float bias = b[c]; - const float *input = x + (batch * channel + c) * hw; - float *output = y + (batch * channel + c) * hw; - for (int j = 0; j < hw; ++j) { - output[j] = scale * ((input[j] - mean) * inv_var) + bias; - } - } - } -} - -int TestBatchNormOp(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = 
std::vector({"input"}); - inputs["Mean"] = std::vector({"mean"}); - inputs["Variance"] = std::vector({"variance"}); - inputs["Scale"] = std::vector({"scale"}); - inputs["Bias"] = std::vector({"bias"}); - outputs["Y"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - - auto mean_var = scope.get()->Var("mean"); - auto mean = mean_var->template GetMutable(); - SetupTensor(mean, framework::make_ddim({input_shape[1]}), -10.0, 10.0); - - auto vari_var = scope.get()->Var("variance"); - auto vari = vari_var->template GetMutable(); - SetupTensor(vari, framework::make_ddim({input_shape[1]}), -10.0, 10.0); - - auto scale_var = scope.get()->Var("scale"); - auto scale = scale_var->template GetMutable(); - SetupTensor(scale, framework::make_ddim({input_shape[1]}), -10.0, - 10.0); - - auto bias_var = scope.get()->Var("bias"); - auto bias = bias_var->template GetMutable(); - SetupTensor(bias, framework::make_ddim({input_shape[1]}), -10.0, 10.0); - - auto output_var = scope.get()->Var("output"); - - float eps = 1e-6; - framework::AttributeMap attrs; - attrs["epsilon"].Set(eps); - attrs["momentum"].Set(0.f); - - auto *op = new operators::BatchNormOp( - "batch_norm", inputs, outputs, attrs, scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - BatchNorm(input, mean, vari, scale, bias, eps, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } -} - -} // namespace paddle_mobile - -int main() { 
- TestBatchNormOp({1, 1, 10, 10}); - TestBatchNormOp({1, 32, 100, 100}); - return 0; -} diff --git a/mobile/test/operators/test_box_coder_op.cpp b/mobile/test/operators/test_box_coder_op.cpp deleted file mode 100644 index 39b8257e66..0000000000 --- a/mobile/test/operators/test_box_coder_op.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/box_coder_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestBoxCoderOp { - public: - explicit TestBoxCoderOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); - for (auto block_desc : blocks) { - std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); - for (auto op : ops) { - if (op->Type() == "box_coder" && - op->Input("PriorBox")[0] == "concat_0.tmp_0") { - DLOG << " mul attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - DLOG << " Input PriorBox is : " << op->Input("PriorBox")[0]; - DLOG << " Input PriorBoxVar is : " << op->Input("PriorBoxVar")[0]; - DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0]; - DLOG << 
" OutputBox is : " << op->Output("OutputBox")[0]; - DLOG << " code_type : " - << op->GetAttrMap().at("code_type").GetString(); - std::shared_ptr> boxcoder = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(boxcoder); - } - } - } - } - - std::shared_ptr predict_boxcoder(const Tensor &t1, const Tensor &t2, - const Tensor &t3) { - // feed - auto scope = program_.scope.get(); - Variable *prior_box = scope->Var("concat_0.tmp_0"); - auto tensor_x1 = prior_box->GetMutable(); - tensor_x1->ShareDataWith(t1); - - Variable *prior_box_var = scope->Var("concat_1.tmp_0"); - auto tensor_x2 = prior_box_var->GetMutable(); - tensor_x2->ShareDataWith(t2); - - Variable *target_box = scope->Var("concat_2.tmp_0"); - auto tensor_x3 = target_box->GetMutable(); - tensor_x3->ShareDataWith(t3); - - Variable *boxes_output = scope->Var("box_coder_0.tmp_0"); - auto *boxes_output_tensor = boxes_output->GetMutable(); - boxes_output_tensor->mutable_data({1, 1917, 4}); - - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - - std::shared_ptr outbox_tensor = std::make_shared(); - outbox_tensor.reset(boxes_output_tensor); - - predict_boxcoder(t1, t2, t3, 0); - - return outbox_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - - void predict_boxcoder(const Tensor &t1, const Tensor &t2, const Tensor &t3, - int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - DLOG << "op -> run()"; - op->Run(); - } - } -}; - -template class TestBoxCoderOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << 
"begin to run BoxCoderOp Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_mobilenet_ssd)); - - paddle_mobile::framework::Tensor priorbox; - SetupTensor(&priorbox, {1917, 4}, static_cast(0), - static_cast(1)); - auto *priorbox_ptr = priorbox.data(); - - paddle_mobile::framework::Tensor priorboxvar; - SetupTensor(&priorboxvar, {1917, 4}, static_cast(0.1), - static_cast(0.2)); - auto *priorboxvar_ptr = priorboxvar.data(); - - paddle_mobile::framework::Tensor targetbox; - SetupTensor(&targetbox, {1, 1917, 4}, static_cast(0), - static_cast(1)); - auto *targetbox_ptr = targetbox.data(); - - paddle_mobile::framework::TestBoxCoderOp testBoxCoderOp( - program); - - auto output_boxcoder = - testBoxCoderOp.predict_boxcoder(priorbox, priorboxvar, targetbox); - auto output_boxcoder_ptr = output_boxcoder->data(); - - for (int i = 0; i < output_boxcoder->numel(); i++) { - DLOG << output_boxcoder_ptr[i]; - } - DLOGF("\n"); - /// testing 25th bbox. - DLOG << "PriorBox**************"; - DLOG << priorbox_ptr[100]; - DLOG << priorbox_ptr[101]; - DLOG << priorbox_ptr[102]; - DLOG << priorbox_ptr[103]; - DLOG << "PriorBoxVar**************"; - DLOG << priorboxvar_ptr[100]; - DLOG << priorboxvar_ptr[101]; - DLOG << priorboxvar_ptr[102]; - DLOG << priorboxvar_ptr[103]; - DLOG << "TargetBox***************"; - DLOG << targetbox_ptr[100]; - DLOG << targetbox_ptr[101]; - DLOG << targetbox_ptr[102]; - DLOG << targetbox_ptr[103]; - DLOG << "OutputBox**************"; - DLOG << output_boxcoder_ptr[100]; - DLOG << output_boxcoder_ptr[101]; - DLOG << output_boxcoder_ptr[102]; - DLOG << output_boxcoder_ptr[103]; - - DLOG << "***********----------------------**************"; - auto priorbox_w = priorbox_ptr[102] - priorbox_ptr[100]; - auto priorbox_h = priorbox_ptr[103] - priorbox_ptr[101]; - auto priorbox_center_x = (priorbox_ptr[100] + priorbox_ptr[102]) / 2; - auto priorbox_center_y = (priorbox_ptr[101] + priorbox_ptr[103]) / 2; - DLOG << "prior box width 
: " << priorbox_w; - DLOG << "prior box height : " << priorbox_h; - DLOG << "prior box center x : " << priorbox_center_x; - DLOG << "prior box center y : " << priorbox_center_y; - auto target_box_center_x = - priorboxvar_ptr[100] * targetbox_ptr[100] * priorbox_w + - priorbox_center_x; - DLOG << "target_box_center_x : " << target_box_center_x; - auto target_box_center_y = - priorboxvar_ptr[101] * targetbox_ptr[101] * priorbox_h + - priorbox_center_y; - DLOG << "target_box_center_y : " << target_box_center_y; - auto target_box_width = - std::exp(priorboxvar_ptr[102] * targetbox_ptr[102]) * priorbox_w; - DLOG << "target_box_width : " << target_box_width; - auto target_box_height = - std::exp(priorboxvar_ptr[103] * targetbox_ptr[103]) * priorbox_h; - DLOG << "target_box_height : " << target_box_height; - DLOG << "pre x min : " << target_box_center_x - target_box_width / 2; - DLOG << "pre y min : " << target_box_center_y - target_box_height / 2; - DLOG << "pre x max : " << target_box_center_x + target_box_width / 2; - DLOG << "pre y max : " << target_box_center_y + target_box_height / 2; - return 0; -} diff --git a/mobile/test/operators/test_cast_op.cpp b/mobile/test/operators/test_cast_op.cpp deleted file mode 100644 index f330e07eaf..0000000000 --- a/mobile/test/operators/test_cast_op.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/cast_op.h" - -namespace paddle_mobile { - -template -void Cast(const framework::Tensor *X, framework::Tensor *Y) { - const Itype *x = X->data(); - Otype *y = Y->mutable_data(); - - for (int i = 0; i < X->numel(); ++i) { - y[i] = static_cast(x[i]); - } -} - -template -int TypeInt() {} -template <> -int TypeInt() { - return 0; -} -template <> -int TypeInt() { - return 2; -} -template <> -int TypeInt() { - return 3; -} -template <> -int TypeInt() { - return 5; -} -template <> -int TypeInt() { - return 6; -} -template <> -int TypeInt() { - return 19; -} -template <> -int TypeInt() { - return 20; -} -template <> -int TypeInt() { - return 21; -} - -template -int TestCastOp(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, static_cast(-100), - static_cast(100)); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - attrs["in_dtype"].Set(TypeInt()); - attrs["out_dtype"].Set(TypeInt()); - auto *op = new operators::CastOp("cast", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - Otype *output_cmp_data = output_cmp.mutable_data(output->dims()); - Cast(input, &output_cmp); - - const Otype *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete 
op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - TestCastOp({1, 100}); - TestCastOp({128, 100}); - - TestCastOp({1, 100}); - TestCastOp({128, 100}); - - TestCastOp({1, 100}); - TestCastOp({128, 100}); - - TestCastOp({1, 100}); - TestCastOp({128, 100}); - return 0; -} diff --git a/mobile/test/operators/test_concat_op.cpp b/mobile/test/operators/test_concat_op.cpp deleted file mode 100644 index 761d1ac51d..0000000000 --- a/mobile/test/operators/test_concat_op.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/concat_op.h" - -namespace paddle_mobile { -using framework::AttributeMap; -using framework::DDim; -using framework::LoDTensor; -using framework::Scope; -using framework::make_ddim; - -template -void concat(const std::vector &input, LoDTensor *output, int axis) { - int num = input.size(); - - int rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - - // computation - auto output_data = output->data(); - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - auto input_data = input[j].data(); - for (int k = 0; k < out_rows; ++k) { - memcpy(output_data + k * out_cols + col_idx, input_data + k * col_len, - sizeof(T) * col_len); - } - col_idx += col_len; - } -} - -template -int TestConcatOP() { - DDim inputA_shape = make_ddim({10, 4, 2, 2}); - DDim inputB_shape = make_ddim({20, 4, 2, 2}); - DDim inputC_shape = make_ddim({30, 4, 2, 2}); - DDim inputD_shape = make_ddim({40, 4, 2, 2}); - DDim output_shape = make_ddim({100, 4, 2, 2}); - int axis_v = 0; - VariableNameMap inputs; - VariableNameMap outputs; - std::vector input_tensors; - auto scope = std::make_shared(); - inputs["X"] = - std::vector({"inputA", "inputB", "inputC", "inputD"}); - outputs["Out"] = std::vector({"output"}); - - auto inputA_var = scope.get()->Var("inputA"); - auto inputA = inputA_var->template GetMutable(); - SetupTensor(inputA, inputA_shape, -127, 127); - input_tensors.push_back(std::move(*inputA)); - - auto inputB_var = scope.get()->Var("inputB"); - auto inputB = inputB_var->template GetMutable(); - SetupTensor(inputB, inputB_shape, -127, 127); - input_tensors.push_back(std::move(*inputB)); - - auto 
inputC_var = scope.get()->Var("inputC"); - auto inputC = inputC_var->template GetMutable(); - SetupTensor(inputC, inputC_shape, -127, 127); - input_tensors.push_back(std::move(*inputC)); - - auto inputD_var = scope.get()->Var("inputD"); - auto inputD = inputD_var->template GetMutable(); - SetupTensor(inputD, inputD_shape, -127, 127); - input_tensors.push_back(std::move(*inputD)); - - auto output_var = scope.get()->Var("output"); - AttributeMap attrs; - attrs["axis"].Set(axis_v); - - auto *op = new operators::ConcatOp("concat", inputs, outputs, - attrs, scope.get()); - op->InferShape(); - op->Run(); - auto output = output_var->template Get(); - const T *output_data = output->data(); - LoDTensor output_cmp; - output_cmp.mutable_data(output_shape); - concat(input_tensors, &output_cmp, axis_v); - const T *output_cmp_data = output_cmp.data(); - // compare - int eq = 0; - int neq = 0; - for (int i = 0; i < output->numel(); ++i) { - PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], - "The execution of test_concat_op is failed!"); - if (output_data[i] == output_cmp_data[i]) { - ++eq; - } else { - ++neq; - } - } - std::cout << "eq = " << eq << ", neq = " << neq << std::endl; - - delete op; - return 0; -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - paddle_mobile::TestConcatOP(); - paddle_mobile::TestConcatOP(); - return 0; -} diff --git a/mobile/test/operators/test_conv_add_relu_op.cpp b/mobile/test/operators/test_conv_add_relu_op.cpp deleted file mode 100644 index f170719218..0000000000 --- a/mobile/test/operators/test_conv_add_relu_op.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/fusion_conv_add_relu_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - // ../models/image_classification_resnet.inference.model - auto program = loader.Load(g_googlenet, true); - - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test< - paddle_mobile::CPU, - paddle_mobile::operators::FusionConvAddReluOp> - executor(program, "fusion_conv_add_relu", true); - - paddle_mobile::framework::Tensor input; - GetInput(g_test_image_1x3x224x224, &input, {1, 3, 224, 224}); - // // use SetupTensor if not has local input image . - // SetupTensor(&input, {1, 3, 224, 224}, static_cast(0), - // static_cast(1)); - - auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112}); - auto output = executor.Predict(input, "data", "conv2d_0.tmp_2", out_ddim); - - auto output_ptr = output->data(); - for (int j = 0; j < 25; ++j) { - DLOG << " value of output: " << output_ptr[j]; - } - return 0; -} diff --git a/mobile/test/operators/test_conv_bn_relu_op.cpp b/mobile/test/operators/test_conv_bn_relu_op.cpp deleted file mode 100644 index b51bdc0737..0000000000 --- a/mobile/test/operators/test_conv_bn_relu_op.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/fusion_conv_bn_relu_op.h" - -namespace paddle_mobile { - -// Reference convolution from Caffe for checking results. -// accumulate through explicit loops over input, output, and filters. -template -int TestConvBnReluOp(int in_channels, int in_height, int in_width, - int out_channels, int groups, std::string opname) { - int kernel_h = Kernel; - int kernel_w = Kernel; - int pad_h = Pad; - int pad_w = Pad; - int stride_h = Stride; - int stride_w = Stride; - int dilation_h = 1; - int dilation_w = 1; - - int batch_size = 1; - int input_c = in_channels; - int input_h = in_height; - int input_w = in_width; - int output_c = out_channels; - framework::DDim input_shape = - framework::make_ddim({batch_size, input_c, input_h, input_w}); - framework::DDim filter_shape = - framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w}); - framework::DDim shape = framework::make_ddim({output_c}); - - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["Input"] = std::vector({"input"}); - inputs["Filter"] = std::vector({"filter"}); - outputs["Out"] = std::vector({"output"}); - inputs["Mean"] = std::vector({"input_mean"}); - inputs["Variance"] = std::vector({"input_variance"}); - inputs["Scale"] = std::vector({"input_scale"}); - inputs["Bias"] = std::vector({"input_bias"}); - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, input_shape, -20.0, 20.0); - - auto filter_var = 
scope.get()->Var("filter"); - auto filter = filter_var->template GetMutable(); - SetupTensor(filter, filter_shape, -20, 20); - - auto input_mean_var = scope.get()->Var("input_mean"); - auto input_mean = input_mean_var->template GetMutable(); - SetupTensor(input_mean, shape, -10.0, 10.0); - auto vari_var = scope.get()->Var("input_variance"); - auto vari = vari_var->template GetMutable(); - SetupTensor(vari, shape, -10.0, 10.0); - auto scale_var = scope.get()->Var("input_scale"); - auto scale = scale_var->template GetMutable(); - SetupTensor(scale, shape, -10.0, 10.0); - auto input_bias_var = scope.get()->Var("input_bias"); - auto input_bias = input_bias_var->template GetMutable(); - SetupTensor(input_bias, shape, -10.0, 10.0); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - attrs["strides"].Set>(std::vector({stride_h, stride_w})); - attrs["paddings"].Set>(std::vector({pad_h, pad_w})); - attrs["dilations"].Set>( - std::vector({dilation_h, dilation_w})); - attrs["groups"].Set(groups); - attrs["epsilon"].Set(1e-6); - attrs["momentum"].Set(0.f); - auto *op = new operators::FusionConvBNReluOp( - "fusion_conv_bn_relu", inputs, outputs, attrs, scope.get()); - op->InferShape(); - op->Init(); - for (int i = 0; i < 10; ++i) { - op->Run(); - } - auto time1 = time(); - for (int i = 0; i < 10; ++i) { - op->Run(); - } - auto time2 = time(); - std::ofstream out_file("./out_conv.txt", std::ios::app); - out_file << opname << " cost :" << time_diff(time1, time2) / 10.0 << "ms" - << std::endl; - out_file.close(); - - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - // kernel = 3, pad = 1, stride = 2 - paddle_mobile::TestConvBnReluOp(3, 48, 48, 16, 1, - "conv_bn_relu"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp(16, 24, 24, 8, 1, - "depthwise_seperable"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp(8, 24, 24, 24, 1, - "MBConv_3x3_conv1"); - // 
kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp(24, 24, 24, 8, 1, - "MBConv_3x3_pw1"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp(8, 24, 24, 24, 1, - "MBConv_3x3_conv2"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp(24, 24, 24, 8, 1, - "MBConv_3x3_pw2"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp(8, 24, 24, 24, 1, - "MBConv_3x3_conv3"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp(24, 12, 12, 16, 1, - "MBConv_3x3_pw3"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 16, 12, 12, 48, 1, "MBConv_5x5_stage1_conv1"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 48, 12, 12, 16, 1, "MBConv_5x5_stage1_pw1"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 16, 12, 12, 48, 1, "MBConv_5x5_stage1_conv2"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 48, 12, 12, 16, 1, "MBConv_5x5_stage1_pw2"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 16, 12, 12, 48, 1, "MBConv_5x5_stage1_conv3"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 48, 6, 6, 32, 1, "MBConv_5x5_stage1_pw3"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 32, 6, 6, 192, 1, "MBConv_5x5_stage2_conv1"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 192, 6, 6, 32, 1, "MBConv_5x5_stage2_pw1"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 32, 6, 6, 192, 1, "MBConv_5x5_stage2_conv2"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 192, 6, 6, 32, 1, "MBConv_5x5_stage2_pw2"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 32, 6, 6, 192, 1, "MBConv_5x5_stage2_conv3"); - // kernel = 1, pad = 0, stride = 1 - paddle_mobile::TestConvBnReluOp( - 192, 6, 6, 64, 1, "MBConv_5x5_stage2_pw3"); - - return 0; -} 
diff --git a/mobile/test/operators/test_conv_gpu.cpp b/mobile/test/operators/test_conv_gpu.cpp deleted file mode 100644 index f9b1782b77..0000000000 --- a/mobile/test/operators/test_conv_gpu.cpp +++ /dev/null @@ -1,199 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_MOBILE_CL -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "common/common.h" -#include "framework/cl/cl_helper.h" -#include "framework/cl/cl_image.h" -#include "operators/conv_op.h" -#include "operators/kernel/cl/cl-kernel-func/conv_func.h" - -namespace paddle_mobile { - -template -int TestConvOp(int in_channels, int in_height, int in_width, int out_channels, - int groups) { - int kernel_h = Kernel; - int kernel_w = Kernel; - int pad_h = Pad; - int pad_w = Pad; - int stride_h = Stride; - int stride_w = Stride; - int dilation_h = 1; - int dilation_w = 1; - - int batch_size = 1; - int input_c = in_channels; - int input_h = in_height; - int input_w = in_width; - int output_c = out_channels; - framework::DDim input_shape = - framework::make_ddim({batch_size, input_c, input_h, input_w}); - framework::DDim filter_shape = - framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w}); - - // std::cerr << " init " << std::endl; - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["Input"] = std::vector({"input"}); - inputs["Filter"] = std::vector({"filter"}); 
- outputs["Output"] = std::vector({"output"}); - cl_context context = scope->GetCLScpoe()->Context(); - cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue(); - - // std::cerr << " input " << std::endl; - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - const int in_numel = framework::product(input_shape); - float *in_data = new float[in_numel]; - for (int i = 0; i < in_numel; ++i) { - in_data[i] = (i % 36 / 6) + 1; - } - input->SetTensorData(in_data, input_shape); - input->InitNormalCLImage(context, command_queue); - DLOG << "input image \n" << *input; - - // std::cerr << " filter " << std::endl; - auto filter_var = scope.get()->Var("filter"); - auto filter = filter_var->template GetMutable(); - const int filter_numel = product(filter_shape); - float *filter_data = new float[filter_numel]; - for (int i = 0; i < filter_numel; ++i) { - filter_data[i] = i % 9; - } - filter->SetTensorData(filter_data, filter_shape); - - // std::cerr << " attrs " << std::endl; - framework::AttributeMap attrs; - attrs["strides"].Set>(std::vector({stride_h, stride_w})); - attrs["paddings"].Set>(std::vector({pad_h, pad_w})); - attrs["dilations"].Set>( - std::vector({dilation_h, dilation_w})); - attrs["groups"].Set(groups); - - std::cerr << " output " << std::endl; - auto output_var = scope.get()->Var("output"); - auto output = output_var->template GetMutable(); - - auto *op = new operators::ConvOp("conv2d", inputs, outputs, - attrs, scope.get()); - - op->InferShape(); - - framework::DDim ddim = output->dims(); - - DLOG << "output dims = " << ddim; - output->InitEmptyImage(context, command_queue, ddim); - - // std::cerr << " op->init " << std::endl; - op->Init(); - - auto time1 = time(); - op->Run(); - auto time2 = time(); - std::cerr << "time cost : " << time_diff(time1, time2) << std::endl; - - delete op; - return 0; -} - -} // namespace paddle_mobile - -int TestAll(const int in_channels, const int in_height, const int 
in_width, - const int out_channels, const int groups) { - std::cerr << "in_channels=" << in_channels << ", in_height=" << in_height - << ", in_width=" << in_width << ", out_channels=" << out_channels - << ", groups=" << groups << std::endl; - std::cerr << "float, kernel=3, pad=1, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - - return 0; -} -#endif - -int main() { - // TestAll(4, 6, 6, 4, 1); - // TestAll(6, 32, 32, 24, 1); - // TestAll(12, 32, 32, 24, 1); - // TestAll(24, 32, 32, 24, 1); - // TestAll(36, 32, 32, 24, 1); - // TestAll(48, 32, 32, 24, 1); - // TestAll(60, 32, 32, 24, 1); - // TestAll(72, 32, 32, 24, 1); - // TestAll(116, 32, 32, 24, 1); - // TestAll(232, 32, 32, 24, 1); - // TestAll(464, 32, 32, 24, 1); - // - // TestAll(6, 64, 64, 24, 1); - // TestAll(12, 64, 64, 24, 1); - // TestAll(24, 64, 64, 24, 1); - // TestAll(36, 64, 64, 24, 1); - // TestAll(48, 64, 64, 24, 1); - // TestAll(60, 64, 64, 24, 1); - // TestAll(72, 64, 64, 24, 1); - // TestAll(116, 64, 64, 24, 1); - // TestAll(232, 64, 64, 24, 1); - // TestAll(464, 64, 64, 24, 1); - // - // TestAll(6, 128, 128, 24, 1); - // TestAll(12, 128, 128, 24, 1); - // TestAll(24, 128, 128, 24, 1); - // TestAll(36, 128, 128, 24, 1); - // TestAll(48, 128, 128, 24, 1); - // TestAll(60, 128, 128, 24, 1); - // TestAll(72, 128, 128, 24, 1); - // TestAll(116, 128, 128, 24, 1); - // TestAll(232, 128, 128, 24, 1); - // TestAll(464, 128, 128, 24, 1); - // - // - // TestAll(6, 32, 32, 6, 1); - // TestAll(12, 32, 32, 12, 1); - // TestAll(24, 32, 32, 24, 1); - // TestAll(36, 32, 32, 36, 1); - // TestAll(48, 32, 32, 48, 1); - // TestAll(60, 32, 32, 60, 1); - // TestAll(72, 32, 32, 72, 1); - // TestAll(116, 32, 32, 116, 1); - // TestAll(232, 32, 32, 232, 1); - // TestAll(464, 32, 32, 464, 1); - // - // TestAll(6, 64, 64, 6, 1); - // TestAll(12, 64, 64, 12, 1); - // TestAll(24, 64, 64, 24, 1); - // TestAll(36, 64, 64, 36, 1); - // TestAll(48, 64, 64, 48, 
1); - // TestAll(60, 64, 64, 60, 1); - // TestAll(72, 64, 64, 72, 1); - // TestAll(116, 64, 64, 116, 1); - // TestAll(232, 64, 64, 232, 1); - // TestAll(464, 64, 64, 464, 1); - // - // TestAll(6, 128, 128, 6, 1); - // TestAll(12, 128, 128, 12, 1); - // TestAll(24, 128, 128, 24, 1); - // TestAll(36, 128, 128, 36, 1); - // TestAll(48, 128, 128, 48, 1); - // TestAll(60, 128, 128, 60, 1); - // TestAll(72, 128, 128, 72, 1); - // TestAll(116, 128, 128, 116, 1); - // TestAll(232, 128, 128, 232, 1); - // TestAll(464, 128, 128, 464, 1); - return 0; -} diff --git a/mobile/test/operators/test_conv_op.cpp b/mobile/test/operators/test_conv_op.cpp deleted file mode 100644 index c705e162fe..0000000000 --- a/mobile/test/operators/test_conv_op.cpp +++ /dev/null @@ -1,358 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/conv_op.h" - -namespace paddle_mobile { - -// Reference convolution from Caffe for checking results. -// accumulate through explicit loops over input, output, and filters. 
-template -void conv2d(const framework::Tensor *input, const framework::Tensor *filter, - const framework::AttributeMap &attrs, framework::Tensor *output) { - framework::AttrReader attr_reader(attrs); - std::vector paddings = attr_reader.Get>("paddings"); - std::vector strides = attr_reader.Get>("strides"); - std::vector dilations = attr_reader.Get>("dilations"); - int groups = attr_reader.Get("groups"); - int kernel_h = filter->dims()[2]; - int kernel_w = filter->dims()[3]; - int pad_h = paddings[0]; - int pad_w = paddings[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int dilation_h = dilations[0]; - int dilation_w = dilations[1]; - auto in_shape = input->dims(); - auto out_shape = output->dims(); - - const bool has_depth = 0; - int kernel_d, pad_d, stride_d, dilation_d; - if (has_depth) { - kernel_d = kernel_h; - stride_d = stride_h; - pad_d = pad_h; - dilation_d = dilation_h; - } else { - kernel_d = stride_d = dilation_d = 1; - pad_d = 0; - } - // Groups - int o_g = out_shape[1] / groups; - int k_g = in_shape[1] / groups; - int o_head, k_head; - // Convolution - vector weight_offset(4 + has_depth); - vector in_offset(4 + has_depth); - vector out_offset(4 + has_depth); - auto offset = [](const framework::Tensor *input, const vector &indics) { - framework::DDim shape = input->dims(); - size_t count = 0; - for (int i = 0; i < indics.size(); ++i) { - count *= shape[i]; - count += indics[i]; - } - return count; - }; - - const Itype *in_data = input->data(); - const Itype *w_data = filter->data(); - Otype *out_data = output->mutable_data(); - memset(out_data, 0, output->numel() * sizeof(Otype)); - for (int n = 0; n < out_shape[0]; n++) { - for (int g = 0; g < groups; g++) { - o_head = o_g * g; - k_head = k_g * g; - for (int o = 0; o < o_g; o++) { - for (int k = 0; k < k_g; k++) { - for (int z = 0; z < (has_depth ? 
out_shape[2] : 1); z++) { - for (int y = 0; y < out_shape[2 + has_depth]; y++) { - for (int x = 0; x < out_shape[3 + has_depth]; x++) { - for (int r = 0; r < kernel_d; r++) { - for (int p = 0; p < kernel_h; p++) { - for (int q = 0; q < kernel_w; q++) { - int in_z = z * stride_d - pad_d + r * dilation_d; - int in_y = y * stride_h - pad_h + p * dilation_h; - int in_x = x * stride_w - pad_w + q * dilation_w; - if (in_z >= 0 && in_z < (has_depth ? in_shape[2] : 1) && - in_y >= 0 && in_y < in_shape[2 + has_depth] && - in_x >= 0 && in_x < in_shape[3 + has_depth]) { - weight_offset[0] = o + o_head; - weight_offset[1] = k; - if (has_depth) { - weight_offset[2] = r; - } - weight_offset[2 + has_depth] = p; - weight_offset[3 + has_depth] = q; - in_offset[0] = n; - in_offset[1] = k + k_head; - if (has_depth) { - in_offset[2] = in_z; - } - in_offset[2 + has_depth] = in_y; - in_offset[3 + has_depth] = in_x; - out_offset[0] = n; - out_offset[1] = o + o_head; - if (has_depth) { - out_offset[2] = z; - } - out_offset[2 + has_depth] = y; - out_offset[3 + has_depth] = x; - - out_data[offset(output, out_offset)] += - in_data[offset(input, in_offset)] * - w_data[offset(filter, weight_offset)]; - } - } - } - } - } - } - } - } - } - } - } -} - -template -int TestConvOp(int in_channels, int in_height, int in_width, int out_channels, - int groups) { - int kernel_h = Kernel; - int kernel_w = Kernel; - int pad_h = Pad; - int pad_w = Pad; - int stride_h = Stride; - int stride_w = Stride; - int dilation_h = 1; - int dilation_w = 1; - - int batch_size = 1; - int input_c = in_channels; - int input_h = in_height; - int input_w = in_width; - int output_c = out_channels; - framework::DDim input_shape = - framework::make_ddim({batch_size, input_c, input_h, input_w}); - framework::DDim filter_shape = - framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w}); - - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["Input"] = 
std::vector({"input"}); - inputs["Filter"] = std::vector({"filter"}); - outputs["Output"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, input_shape, -20.0, 20.0); - - auto filter_var = scope.get()->Var("filter"); - auto filter = filter_var->template GetMutable(); - SetupTensor(filter, filter_shape, -20, 20); - - // for (int i = 0; i < input->numel(); ++i) { - // DLOG << "input[" << i << "] = " << float(input->data()[i]); - // } - // for (int i = 0; i < filter->numel(); ++i) { - // DLOG << "filter[" << i << "] = " << float(filter->data()[i]); - // } - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - attrs["strides"].Set>(std::vector({stride_h, stride_w})); - attrs["paddings"].Set>(std::vector({pad_h, pad_w})); - attrs["dilations"].Set>( - std::vector({dilation_h, dilation_w})); - attrs["groups"].Set(groups); - - auto *op = new operators::ConvOp("conv2d", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - // struct timespec ts_begin, ts_end; - // warmup - // op->Run(); - // clock_gettime(CLOCK_MONOTONIC, &ts_begin); - // for (int i = 0; i < 10; ++i) { - op->Run(); - // } - // clock_gettime(CLOCK_MONOTONIC, &ts_end); - // uint64_t elapsed = (ts_end.tv_sec - ts_begin.tv_sec) * 1e3 + - // (ts_end.tv_nsec - ts_begin.tv_nsec) / 1e6; - // LOG(kLOG_INFO) << "elapsed: " << elapsed / 10.0 << " ms"; - - // compare results - auto *output = output_var->template Get(); - framework::Tensor output_cmp; - output_cmp.mutable_data(output->dims()); - conv2d(input, filter, attrs, &output_cmp); - - const Otype *output_data = output->data(); - Otype *output_cmp_data = output_cmp.data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = abs(output_data[i] - output_cmp_data[i]); - // PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3, - // "output[%d] = %d, output_cmp[%d] = %d", i, - // output_data[i], i, 
output_cmp_data[i]); - if (gap > 1e-2 && (gap / (abs(output_data[i]) + 1e-5) > 1e-2)) { - std::cerr << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i << "] = " << output_cmp_data[i] - << std::endl; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int TestAll(const int in_channels, const int in_height, const int in_width, - const int out_channels, const int groups) { - std::cerr << "in_channels=" << in_channels << ", in_height=" << in_height - << ", in_width=" << in_width << ", out_channels=" << out_channels - << ", groups=" << groups << std::endl; - std::cerr << "float, kernel=1, pad=0, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - - // kernel = 3, pad = 0, stride = 1 - std::cerr << "float, kernel=3, pad=0, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 1, stride = 1 - std::cerr << "float, kernel=3, pad=1, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 2, stride = 1 - std::cerr << "float, kernel=3, pad=2, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 5, stride = 1 - std::cerr << "float, kernel=3, pad=5, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - - // kernel = 3, pad = 0, stride = 2 - std::cerr << "float, kernel=3, pad=0, stride=2" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 1, stride = 2 - std::cerr << "float, kernel=3, pad=1, stride=2" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 2, stride = 2 - std::cerr << "float, kernel=3, pad=2, stride=2" << 
std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 5, stride = 2 - std::cerr << "float, kernel=3, pad=5, stride=2" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - -#ifndef __aarch64__ - // kernel = 3, pad = 0, stride = 1 - std::cerr << "int8, kernel=3, pad=0, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 1, stride = 1 - std::cerr << "int8, kernel=3, pad=1, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 2, stride = 1 - std::cerr << "int8, kernel=3, pad=2, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 5, stride = 1 - std::cerr << "int8, kernel=3, pad=5, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - - // kernel = 3, pad = 0, stride = 2 - std::cerr << "int8, kernel=3, pad=0, stride=2" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 1, stride = 2 - std::cerr << "int8, kernel=3, pad=1, stride=2" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 2, stride = 2 - std::cerr << "int8, kernel=3, pad=2, stride=2" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 3, pad = 5, stride = 2 - std::cerr << "int8, kernel=3, pad=5, stride=2" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); -#endif // __aarch64__ - - // kernel = 5, pad = 0, stride = 1 - std::cerr << "float, kernel=5, pad=0, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, 
in_width, out_channels, groups); - // kernel = 5, pad = 1, stride = 1 - std::cerr << "float, kernel=5, pad=1, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 5, pad = 2, stride = 1 - std::cerr << "float, kernel=5, pad=2, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 5, pad = 5, stride = 1 - std::cerr << "float, kernel=5, pad=5, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - -#ifndef __aarch64__ - // kernel = 5, pad = 0, stride = 1 - std::cerr << "int8, kernel=5, pad=0, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 5, pad = 1, stride = 1 - std::cerr << "int8, kernel=5, pad=1, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 5, pad = 2, stride = 1 - std::cerr << "int8, kernel=5, pad=2, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); - // kernel = 5, pad = 5, stride = 1 - std::cerr << "int8, kernel=5, pad=5, stride=1" << std::endl; - paddle_mobile::TestConvOp( - in_channels, in_height, in_width, out_channels, groups); -#endif // __aarch64__ - - return 0; -} - -int main() { - TestAll(16, 10, 10, 16, 16); - TestAll(1, 5, 5, 1, 1); - TestAll(1, 5, 5, 10, 1); - TestAll(10, 5, 5, 10, 10); - - TestAll(5, 33, 33, 5, 1); - TestAll(5, 33, 33, 13, 1); - TestAll(13, 33, 33, 13, 13); - - TestAll(5, 33, 13, 5, 1); - TestAll(5, 33, 13, 13, 1); - TestAll(13, 33, 13, 13, 13); - return 0; -} diff --git a/mobile/test/operators/test_depthwise_conv_op.cpp b/mobile/test/operators/test_depthwise_conv_op.cpp deleted file mode 100644 index 77c76eedc5..0000000000 --- a/mobile/test/operators/test_depthwise_conv_op.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/depthwise_conv_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - // ../models/image_classification_resnet.inference.model - auto program = loader.Load(g_mobilenet_ssd); - - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "depthwise_conv2d"); - - paddle_mobile::framework::LoDTensor input; - // GetInput(g_test_image_1x3x224x224, &input, {1, 3, 224, 224}); - // use SetupTensor if not has local input image . - SetupTensor(&input, {1, 32, 150, 150}, static_cast(0), - static_cast(1)); - auto input_ptr = input.data(); - auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 150, 150}); - auto output = executor.Predict(input, "batch_norm_0.tmp_3", - "depthwise_conv2d_0.tmp_0", out_ddim); - - auto output_ptr = output->data(); - for (int j = 0; j < output->numel(); ++j) { - DLOG << " value of output: " << output_ptr[j]; - } - return 0; -} diff --git a/mobile/test/operators/test_dequantize_op.cpp b/mobile/test/operators/test_dequantize_op.cpp deleted file mode 100644 index 981439c66f..0000000000 --- a/mobile/test/operators/test_dequantize_op.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/dequantize_op.h" - -namespace paddle_mobile { - -void dequantize(const Tensor* input, const float scale, Tensor* output) { - const int32_t* x = input->data(); - float* y = output->mutable_data(); - size_t size = output->numel(); - for (size_t i = 0; i < size; ++i) { - y[i] = x[i] * scale; - } -} - -int TestDequqntizeOp() { - framework::DDim dim = framework::make_ddim({1, 3, 224, 224}); - - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - inputs["Scale"] = std::vector({"scale"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dim, -1000, 1000); - - auto scale_var = scope.get()->Var("scale"); - auto scale = scale_var->template GetMutable(); - scale->Resize(framework::make_ddim({1})); - scale->mutable_data()[0] = 1.27; - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - attrs["weight_scale"].Set(1.74); - - auto* op = new operators::DequantizeOp( - "dequantize", inputs, outputs, attrs, scope.get()); - op->InferShape(); - op->Run(); - auto output = output_var->template Get(); - const float* output_data = output->data(); - - framework::Tensor output_cmp; - output_cmp.Resize(dim); - float dequant_scale = 1.27 / 1.74; - dequantize(input, 
dequant_scale, &output_cmp); - const float* output_cmp_data = output_cmp.data(); - for (int i = 0; i < output->numel(); ++i) { - PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], - "output[%d] = %.6f, output_cmp[%d] = %.6f", i, - output_data[i], i, output_cmp_data[i]); - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main() { return paddle_mobile::TestDequqntizeOp(); } diff --git a/mobile/test/operators/test_dwconv_bn_relu_op.cpp b/mobile/test/operators/test_dwconv_bn_relu_op.cpp deleted file mode 100644 index 8b2e6f06b2..0000000000 --- a/mobile/test/operators/test_dwconv_bn_relu_op.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/fusion_dwconv_bn_relu_op.h" - -namespace paddle_mobile { - -template -int TestDWConvAddBnReluOp(int in_channels, int in_height, int in_width, - int out_channels, int groups, std::string opname) { - int kernel_h = Kernel; - int kernel_w = Kernel; - int pad_h = Pad; - int pad_w = Pad; - int stride_h = Stride; - int stride_w = Stride; - int dilation_h = 1; - int dilation_w = 1; - - int batch_size = 1; - int input_c = in_channels; - int input_h = in_height; - int input_w = in_width; - int output_c = out_channels; - framework::DDim input_shape = - framework::make_ddim({batch_size, input_c, input_h, input_w}); - framework::DDim filter_shape = - framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w}); - framework::DDim shape = framework::make_ddim({output_c}); - - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["Input"] = std::vector({"input"}); - inputs["Filter"] = std::vector({"filter"}); - inputs["Mean"] = std::vector({"mean"}); - inputs["Variance"] = std::vector({"variance"}); - inputs["Scale"] = std::vector({"scale"}); - inputs["Bias"] = std::vector({"bias"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, input_shape, -20.0, 20.0); - - auto filter_var = scope.get()->Var("filter"); - auto filter = filter_var->template GetMutable(); - SetupTensor(filter, filter_shape, -20, 20); - - auto mean_var = scope.get()->Var("mean"); - auto mean = mean_var->template GetMutable(); - SetupTensor(mean, shape, -10.0, 10.0); - - auto vari_var = scope.get()->Var("variance"); - auto vari = vari_var->template GetMutable(); - SetupTensor(vari, shape, -10.0, 10.0); - - auto scale_var = scope.get()->Var("scale"); - auto scale = scale_var->template GetMutable(); - SetupTensor(scale, shape, -10.0, 10.0); - - auto bias_var = 
scope.get()->Var("bias"); - auto bias = bias_var->template GetMutable(); - SetupTensor(bias, shape, -10.0, 10.0); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - attrs["strides"].Set>(std::vector({stride_h, stride_w})); - attrs["paddings"].Set>(std::vector({pad_h, pad_w})); - attrs["dilations"].Set>( - std::vector({dilation_h, dilation_w})); - attrs["groups"].Set(groups); - attrs["epsilon"].Set(1e-6); - attrs["momentum"].Set(0.f); - - auto *op = new operators::FusionDWConvBNReluOp( - "fusion_dwconv_bn_relu", inputs, outputs, attrs, scope.get()); - op->InferShape(); - op->Init(); - for (int i = 0; i < 10; ++i) { - op->Run(); - } - auto time1 = time(); - for (int i = 0; i < 10; ++i) { - op->Run(); - } - auto time2 = time(); - std::ofstream out_file("./out_dwconv.txt", std::ios::app); - out_file << opname << " cost :" << time_diff(time1, time2) / 10.0 << "ms" - << std::endl; - out_file.close(); - - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - // kernel = 3, pad = 1, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 16, 24, 24, 16, 16, "depthwise_seperable"); - // kernel = 3, pad = 1, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 24, 24, 24, 24, 24, "MBConv_3x3_dw1"); - // kernel = 3, pad = 1, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 24, 24, 24, 24, 24, "MBConv_3x3_dw2"); - // kernel = 3, pad = 1, stride = 2 - paddle_mobile::TestDWConvAddBnReluOp( - 24, 24, 24, 24, 24, "MBConv_3x3_dw3"); - // kernel = 5, pad = 2, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 48, 12, 12, 48, 48, "MBConv_5x5_stage1_dw1"); - // kernel = 5, pad = 2, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 48, 12, 12, 48, 48, "MBConv_5x5_stage1_dw2"); - // kernel = 5, pad = 2, stride = 2 - paddle_mobile::TestDWConvAddBnReluOp( - 48, 12, 12, 48, 48, "MBConv_5x5_stage1_dw3"); - // kernel = 5, pad = 2, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 192, 6, 6, 192, 192, 
"MBConv_5x5_stage2_dw1"); - // kernel = 5, pad = 2, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 192, 6, 6, 192, 192, "MBConv_5x5_stage2_dw2"); - // kernel = 5, pad = 2, stride = 1 - paddle_mobile::TestDWConvAddBnReluOp( - 192, 6, 6, 192, 192, "MBConv_5x5_stage2_dw3"); - - return 0; -} diff --git a/mobile/test/operators/test_elementwise_add_op.cpp b/mobile/test/operators/test_elementwise_add_op.cpp deleted file mode 100644 index 3922b216cf..0000000000 --- a/mobile/test/operators/test_elementwise_add_op.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" - -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(g_resnet); - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "elementwise_add"); - - // 1. input_tensors; - vector input_tensors; - - Tensor input1; - auto input1_data = CreateInput(&input1, {1, 3, 224, 224}, 0, 1); - input_tensors.push_back(input1); - - Tensor input2; - auto input2_data = CreateInput(&input2, {224}, 0, 1); - input_tensors.push_back(input2); - - // 2. input_names - vector input_names({ - "batch_norm_2.tmp_2", - "batch_norm_0.tmp_3", - }); - - // 3. output_names - vector output_names({"elementwise_add_0.tmp_0"}); - - // 4. 
out_dims; - vector out_ddims; - auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224}); - out_ddims.push_back(out_ddim); - - auto output = executor.Predict(input_tensors, input_names, - output_names, out_ddims); - - auto output0_data = output[0]->data(); - /// output (1,3,224,224) - DLOG << "output memory size : " << output[0]->memory_size(); - DLOG << "output numel : " << output[0]->numel(); - - DLOG << input1_data[226] << " + " << input2_data[2] << " = " - << output0_data[226]; -} diff --git a/mobile/test/operators/test_elementwise_sub_op.cpp b/mobile/test/operators/test_elementwise_sub_op.cpp deleted file mode 100644 index d07d42849b..0000000000 --- a/mobile/test/operators/test_elementwise_sub_op.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/elementwise_sub_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestElementwiseSubOp { - public: - explicit TestElementwiseSubOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - if (op->Type() == "elementwise_sub" && - op->Input("X")[0] == "sigmoid_1.tmp_0") { - DLOG << " elementwise_sub attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - - std::shared_ptr> lrn = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(lrn); - } - } - } - } - - std::shared_ptr predict_bn(const Tensor &t1, const Tensor &t2) { - // feed - auto scope = program_.scope.get(); - Variable *x1_feed_value = scope->Var("tmp_0"); - auto tensor_x1 = x1_feed_value->GetMutable(); - tensor_x1->ShareDataWith(t1); - - Variable *x2_feed_value = scope->Var("sigmoid_1.tmp_0"); - auto tensor_x2 = x2_feed_value->GetMutable(); - tensor_x2->ShareDataWith(t2); - - Variable *output = scope->Var("tmp_1"); - auto *output_tensor = output->GetMutable(); - output_tensor->mutable_data({1, 1, 6, 6}); - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict_bn(t1, t2, 0); - return out_tensor; - } - - 
private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - - void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - DLOG << "op -> run()"; - op->Run(); - } - } -}; - -template class TestElementwiseSubOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run ElementwiseSub Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_ocr) + "/model", - std::string(g_ocr) + "/params"); - - /// input x1 (1,1,6,6) - paddle_mobile::framework::Tensor inputx1; - SetupTensor(&inputx1, {1, 1, 6, 6}, static_cast(0), - static_cast(1)); - auto *inputx1_ptr = inputx1.data(); - - /// input x2 (1,1,6,6) - paddle_mobile::framework::Tensor inputx2; - SetupTensor(&inputx2, {1, 1, 6, 6}, static_cast(0), - static_cast(1)); - auto *inputx2_ptr = inputx2.data(); - - paddle_mobile::framework::TestElementwiseSubOp - testElementwiseSubOp(program); - - auto output_op = testElementwiseSubOp.predict_bn(inputx1, inputx2); - auto *output_op_ptr = output_op->data(); - - auto inputx1_dim = inputx1.numel() / inputx1.dims()[0]; - DLOG << " input1 : "; - for (int i = 0; i < inputx1.dims()[0]; ++i) { - for (int j = 0; j < inputx1_dim; ++j) { - DLOGF("%f ", inputx1_ptr[i * inputx1_dim + j]); - } - DLOGF("\n"); - } - - auto inputx2_dim = inputx2.numel() / inputx2.dims()[0]; - DLOG << " input2 : "; - for (int i = 0; i < inputx2.dims()[0]; ++i) { - for (int j = 0; j < inputx2_dim; ++j) { - DLOGF("%f ", inputx2_ptr[i * inputx2_dim + j]); - } - DLOGF("\n"); - } - - auto output_dim = output_op->numel() / output_op->dims()[0]; - DLOG << " output : "; - for (int i = 0; i < 
output_op->dims()[0]; ++i) { - for (int j = 0; j < output_dim; ++j) { - DLOGF("%f ", output_op_ptr[i * output_dim + j]); - } - DLOGF("\n"); - } - - return 0; -} diff --git a/mobile/test/operators/test_expend_op.cpp b/mobile/test/operators/test_expend_op.cpp deleted file mode 100644 index cbe307ac69..0000000000 --- a/mobile/test/operators/test_expend_op.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_MOBILE_CL -#include "../executor_for_test_opencl.h" -#include "operators/expand_op.h" -#include "operators/feed_op.h" -#ifdef EXPAND_OP - -int main() { - const int IN_N = 1; - const int IN_C = 1; - const int IN_H = 2; - const int IN_W = 3; - - const int EXPEND_N = 1; - const int EXPEND_C = 1; - const int EXPEND_H = 2; - const int EXPEND_W = 2; - - const int OUT_N = IN_N * EXPEND_N; - const int OUT_C = IN_C * EXPEND_C; - const int OUT_H = IN_H * EXPEND_H; - const int OUT_W = IN_W * EXPEND_W; - - framework::DDim in_dims = framework::make_ddim({IN_N, IN_C, IN_H, IN_W}); - framework::DDim out_dims = framework::make_ddim({OUT_N, OUT_C, OUT_H, OUT_W}); - VariableNameMap inputs; - VariableNameMap outputs; - AttributeMap attrs; - inputs["X"] = std::vector({"op_in"}); - outputs["Out"] = std::vector({"op_out"}); - - std::vector expand_times = {EXPEND_N, EXPEND_C, EXPEND_H, EXPEND_W}; - attrs["expand_times"].Set>(expand_times); - - OpenClOpTester> tester; - tester.Predict("expend", 
in_dims, out_dims, inputs, outputs, attrs); -} -#endif - -#else -int main() {} -#endif diff --git a/mobile/test/operators/test_fill_constant_op.cpp b/mobile/test/operators/test_fill_constant_op.cpp deleted file mode 100644 index 86a4bf0a37..0000000000 --- a/mobile/test/operators/test_fill_constant_op.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/fill_constant_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestFillConstantOp { - public: - explicit TestFillConstantOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - const std::vector> blocks = - to_predict_program_->Blocks(); - for (auto block_desc : blocks) { - std::vector> ops = block_desc->Ops(); - for (auto op : ops) { - if (op->Type() == "fill_constant") { - DLOG << " attr size: " << op->GetAttrMap().size(); - std::unordered_map attrs = op->GetAttrMap(); - for (std::unordered_map::iterator it = - attrs.begin(); - it != attrs.end(); ++it) { - DLOG << " " << it->first << " " << it->second; - } - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - DLOG << " output is : " << op->Output("Out")[0]; - output_var_name = op->Output("Out")[0]; - std::shared_ptr> op_ptr = - 
std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(op_ptr); - } - } - } - } - - std::shared_ptr predict() { - auto scope = program_.scope.get(); - - Variable *output = scope->Var(output_var_name); - auto *output_tensor = output->GetMutable(); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict(0); - - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - string output_var_name; - - void predict(int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - op->Run(); - } - } -}; - -template class TestFillConstantOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run FillConstant Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_ocr) + "/model", - std::string(g_ocr) + "/params"); - - paddle_mobile::framework::TestFillConstantOp - testFillConstantOp(program); - - auto output = testFillConstantOp.predict(); - auto *output_ptr = output->data(); - - DLOG << "output : "; - for (int i = 0; i < output->numel(); ++i) { - DLOG << " index " << i << " : " << output_ptr[i]; - } - return 0; -} diff --git a/mobile/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/mobile/test/operators/test_fusion_conv_add_bn_relu_op.cpp deleted file mode 100644 index 347bcb40a6..0000000000 --- a/mobile/test/operators/test_fusion_conv_add_bn_relu_op.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_include.h" -#include "operators/fusion_conv_add_bn_relu_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - // ../models/image_classification_resnet.inference.model - auto program = loader.Load(g_mobilenet, true); - - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "fusion_conv_add_bn_relu", true); - - std::cout << "executor 4 test: " << std::endl; - - paddle_mobile::framework::Tensor input; - GetInput(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224}); - // // use SetupTensor if not has local input image . 
- // SetupTensor(&input, {1, 3, 224, 224}, static_cast(0), - // static_cast(1)); - - DLOG << " fuck: " << input; - - auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112}); - std::cout << "before predict: " << std::endl; - auto output = - executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim); - std::cout << "after predict " << std::endl; - auto output_ptr = output->data(); - - int stride = output->numel() / 100; - for (int i = 0; i < 100; i++) { - DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride]; - } - - // for (int i = 0; i < 100; i++) { - // DLOG << " index:" << i << " value: "<< output_ptr[i]; - // } - - // for (int j = 0; j < output->numel(); ++j) { - // std::cout << " (index: " << j << " value: " << output_ptr[j] << ") "; - // } - std::cout << std::endl; - return 0; -} diff --git a/mobile/test/operators/test_fusion_fc_op.cpp b/mobile/test/operators/test_fusion_fc_op.cpp deleted file mode 100644 index 60ed4976ec..0000000000 --- a/mobile/test/operators/test_fusion_fc_op.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "framework/operator.h" -#include "operators/fusion_fc_op.h" - -#define a(i, j) a[(i)*lda + (j)] -#define b(i, j) b[(i)*ldb + (j)] -#define c(i, j) c[(i)*ldc + (j)] - -namespace paddle_mobile { -using framework::AttributeMap; -using framework::DDim; -using framework::Scope; -using framework::make_ddim; - -int32_t qadd_int32(int32_t l, int32_t r) { - int64_t res = static_cast(l) + static_cast(r); - if (res > std::numeric_limits::max()) - return std::numeric_limits::max(); - else if (res < std::numeric_limits::min()) - return std::numeric_limits::min(); - else - return static_cast(res); -} - -// round to zero -float round2zero(float v) { - float res; - if (v > 0) - res = std::floor(v); - else if (v < 0) - res = std::ceil(v); - return res; -} - -int8_t qscale_int32(int32_t v, float scale) { - float res = static_cast(v) * scale; - res = round2zero(res); - if (res > 127) - return static_cast(127); - else if (res < -127) - return static_cast(-127); - else - return static_cast(res); -} - -template -int TestFcOP() { - int32_t m = 377; - int32_t n = 1363; - int32_t k = 577; - int32_t lda = k; - int32_t ldb = n; - int32_t ldc = n; - DDim inputA_shape = make_ddim({m, k}); - DDim inputB_shape = make_ddim({k, n}); - DDim bias_shape = make_ddim({n}); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputA"}); - inputs["Y"] = std::vector({"inputB"}); - inputs["Z"] = std::vector({"bias"}); - inputs["Scale"] = std::vector({"scale"}); - outputs["Out"] = std::vector({"output"}); - - auto inputA_var = scope.get()->Var("inputA"); - auto inputA = inputA_var->template GetMutable(); - SetupTensor(inputA, inputA_shape, -127, 127); - auto inputB_var = scope.get()->Var("inputB"); - auto inputB = inputB_var->template GetMutable(); - SetupTensor(inputB, inputB_shape, -127, 127); - auto bias_var = scope.get()->Var("bias"); - auto bias 
= bias_var->template GetMutable(); - SetupTensor(bias, bias_shape, -127, 127); - - framework::Tensor origin_matrix; - T *origin_inputB_ptr = origin_matrix.mutable_data(inputB_shape); - memcpy(origin_inputB_ptr, inputB->data(), - sizeof(*origin_inputB_ptr) * k * n); - - auto scale_var = scope.get()->Var("scale"); - auto scale = scale_var->template GetMutable(); - scale->Resize(framework::make_ddim({1})); - float scale_v = 0.000828f; - scale->mutable_data()[0] = scale_v; - - auto output_var = scope.get()->Var("output"); - AttributeMap attrs; - attrs["x_num_col_dims"].Set(1); - attrs["y_num_col_dims"].Set(1); - attrs["axis"].Set(1); - operators::OperatorBase *op = nullptr; - op = new operators::FusionFcOp("fusion_fc", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - auto output = output_var->template Get(); - const T *output_data = output->data(); - // compare - T *c = static_cast(memory::Alloc(sizeof(T) * m * n)); - T *a = inputA->data(); - T *b = origin_inputB_ptr; - S *bias_data = bias->data(); - for (int32_t i = 0; i < m; ++i) { - for (int32_t j = 0; j < n; ++j) { - S bias_v = bias_data[j]; - if (std::is_same::value) { - int32_t r = 0; - for (int32_t p = 0; p < k; p++) { - r += static_cast(a(i, p)) * static_cast(b(p, j)); - } - r = qadd_int32(r, bias_v); - c(i, j) = qscale_int32(r, scale_v); - } else { - T r = 0; - for (int32_t p = 0; p < k; p++) { - r += a(i, p) * b(p, j); - } - r += bias_v; - c(i, j) = r; - } - } - } - - int32_t eq = 0; - int32_t neq = 0; - for (int32_t i = 0; i < m * n; ++i) { - PADDLE_MOBILE_ENFORCE(output_data[i] == c[i], - "The execution of test_fusion_fc_op is failed!"); - if (output_data[i] == c[i]) { - ++eq; - } else { - ++neq; - } - } - std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq - << " neq=" << neq << std::endl; - delete op; - return 0; -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - 
paddle_mobile::TestFcOP(); - return 0; -} diff --git a/mobile/test/operators/test_gru_op.cpp b/mobile/test/operators/test_gru_op.cpp deleted file mode 100644 index d17b2d6a2d..0000000000 --- a/mobile/test/operators/test_gru_op.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/gru_op.h" - -namespace paddle_mobile { - -template -int TestGruOp(int in_channels, int out_channels, std::string opname) { - size_t input_c = in_channels; - size_t output_c = out_channels; - paddle_mobile::framework::LoD lod{{0, input_c}}; - int batch_size = lod.size(); - framework::DDim input_shape = framework::make_ddim({input_c, output_c * 3}); - framework::DDim weight_shape = framework::make_ddim({output_c, output_c * 3}); - framework::DDim h0_shape = framework::make_ddim({batch_size, output_c}); - framework::DDim bias_shape = framework::make_ddim({batch_size, output_c * 3}); - - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["Input"] = std::vector({"input"}); - inputs["Weight"] = std::vector({"weight"}); - inputs["H0"] = std::vector({"h0"}); - inputs["Bias"] = std::vector({"bias"}); - - outputs["BatchGate"] = std::vector({"output_batch_gate"}); - outputs["BatchResetHiddenPrev"] = - std::vector({"output_batch_reset_hidden_prev"}); - outputs["BatchHidden"] = 
std::vector({"output_batch_hidden"}); - outputs["Hidden"] = std::vector({"output_hidden"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, input_shape, -127, 127); - input->set_lod(lod); - - auto weight_var = scope.get()->Var("weight"); - auto weight = weight_var->template GetMutable(); - SetupTensor(weight, weight_shape, -127, 127); - - auto h0_var = scope.get()->Var("h0"); - auto h0 = h0_var->template GetMutable(); - SetupTensor(h0, h0_shape, -127, 127); - - auto bias_var = scope.get()->Var("bias"); - auto bias = bias_var->template GetMutable(); - SetupTensor(bias, bias_shape, -127, 127); - - auto batch_gate_var = scope.get()->Var("output_batch_gate"); - auto batch_reset_hidden_prev_var = - scope.get()->Var("output_batch_reset_hidden_prev"); - auto batch_hidden_var = scope.get()->Var("output_batch_hidden"); - auto hidden_var = scope.get()->Var("output_hidden"); - - framework::AttributeMap attrs; - attrs["activation"].Set(std::string("relu")); - attrs["gate_activation"].Set(std::string("sigmoid")); - attrs["is_reverse"].Set(false); - - auto *op = new operators::GruOp("gru", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - for (int i = 0; i < 10; ++i) { - op->Run(); - } - auto time1 = time(); - for (int i = 0; i < 10; ++i) { - op->Run(); - } - auto time2 = time(); - std::ofstream out_file("./out_gru.txt", std::ios::app); - out_file << opname << " cost :" << time_diff(time1, time2) / 10.0 << "ms" - << std::endl; - out_file.close(); - - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - paddle_mobile::TestGruOp(384, 120, "gru_forward"); - return 0; -} diff --git a/mobile/test/operators/test_im2sequence_op.cpp b/mobile/test/operators/test_im2sequence_op.cpp deleted file mode 100644 index 247e6a466f..0000000000 --- a/mobile/test/operators/test_im2sequence_op.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/im2sequence_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestIm2SequenceOp { - public: - explicit TestIm2SequenceOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - if (op->Type() == "im2sequence" && - op->Input("X")[0] == "conv2d_19.tmp_1") { - DLOG << " im2squence attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - - std::shared_ptr> lrn = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(lrn); - } - } - } - } - - std::shared_ptr predict_bn(const Tensor &t1) { - // feed - auto scope = program_.scope.get(); - Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1"); - auto tensor_x1 = x1_feed_value->GetMutable(); - 
tensor_x1->ShareDataWith(t1); - Variable *output = scope->Var("im2sequence_0.tmp_0"); - auto *output_tensor = output->GetMutable(); - output_tensor->mutable_data({2, 12}); - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict_bn(t1, 0); - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - - void predict_bn(const Tensor &t1, int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - DLOG << "op -> run()"; - op->Run(); - } - } -}; - -template class TestIm2SequenceOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run Im2Sequence Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_eng) + "/model", - std::string(g_eng) + "/params"); - - /// input x (4,10,2,2) - paddle_mobile::framework::Tensor inputx; - SetupTensor(&inputx, {1, 2, 6, 2}, static_cast(0), - static_cast(1)); - auto *inputx_ptr = inputx.data(); - - paddle_mobile::framework::TestIm2SequenceOp - testIm2SequenceOp(program); - - auto output_op = testIm2SequenceOp.predict_bn(inputx); - auto *output_op_ptr = output_op->data(); - - auto input_dim = inputx.numel() / inputx.dims()[0]; - DLOG << " input : "; - for (int i = 0; i < inputx.dims()[0]; ++i) { - for (int j = 0; j < input_dim; ++j) { - DLOGF("%f ", inputx_ptr[i * input_dim + j]); - } - DLOGF("\n"); - } - - auto output_dim = output_op->numel() / output_op->dims()[0]; - DLOG << " output : "; - for (int i = 0; i < output_op->dims()[0]; ++i) { - for (int j = 0; j < output_dim; ++j) { - DLOGF("%f ", 
output_op_ptr[i * output_dim + j]); - } - DLOGF("\n"); - } - - return 0; -} diff --git a/mobile/test/operators/test_increment_op.cpp b/mobile/test/operators/test_increment_op.cpp deleted file mode 100644 index 32f6a57b60..0000000000 --- a/mobile/test/operators/test_increment_op.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/increment_op.h" - -namespace paddle_mobile { - -template -void Increment(const framework::Tensor *input, framework::Tensor *out, - int step) { - auto input_data = input->data(); - auto out_data = out->data(); - *out_data = *input_data + step; -} - -int TestIncrementOp(const std::vector input_shape, int step) { - framework::DDim input_dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputX"}); - outputs["Out"] = std::vector({"output"}); - - auto x_var = scope.get()->Var("inputX"); - auto x = x_var->template GetMutable(); - SetupTensor(x, input_dims, 0, 100); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - attrs["step"].Set(step); - - auto *op = new operators::IncrementOp( - "increment", inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - framework::Tensor output_cmp; - float 
*output_cmp_data = output_cmp.mutable_data(output->dims()); - Increment(x, &output_cmp, step); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestIncrementOp({1}, 4); - paddle_mobile::TestIncrementOp({1}, 10); - DLOG << "test increment op pass."; - return 0; -} diff --git a/mobile/test/operators/test_is_empty_op.cpp b/mobile/test/operators/test_is_empty_op.cpp deleted file mode 100644 index 9bf9443acd..0000000000 --- a/mobile/test/operators/test_is_empty_op.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/is_empty_op.h" - -namespace paddle_mobile { - -void IsEmpty(const framework::Tensor *input, framework::Tensor *out) { - out->data()[0] = input->numel() == 0; -} - -int TestIsEmptyOp(const std::vector input_shape) { - framework::DDim input_dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputX"}); - outputs["Out"] = std::vector({"output"}); - - auto x_var = scope.get()->Var("inputX"); - auto x = x_var->template GetMutable(); - SetupTensor(x, input_dims, 0, 100); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - - auto *op = new operators::IsEmptyOp("is_empty", inputs, outputs, - attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - framework::Tensor output_cmp; - bool *output_cmp_data = output_cmp.mutable_data(output->dims()); - IsEmpty(x, &output_cmp); - - const bool *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - if (output_data[i] != output_cmp_data[i]) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestIsEmptyOp({1, 3, 100, 100}); - paddle_mobile::TestIsEmptyOp({0}); - DLOG << "test is_empty op pass."; - return 0; -} diff --git a/mobile/test/operators/test_leaky_relu_op.cpp b/mobile/test/operators/test_leaky_relu_op.cpp deleted file mode 100644 index 3349fbd92c..0000000000 --- a/mobile/test/operators/test_leaky_relu_op.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "../test_include.h" -#include "operators/activation_op.h" - -namespace paddle_mobile { - -void LeakyRelu(const framework::Tensor *X, framework::Tensor *Y, float alpha) { - const float *x = X->data(); - float *y = Y->mutable_data(); - - for (int i = 0; i < X->numel(); ++i) { - y[i] = std::max(x[i], x[i] * alpha); - } -} - -int TestLeakyReluOp(const std::vector input_shape, float alpha) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - attrs["alpha"].Set(alpha); - - auto *op = new operators::LeakyReluOp( - "leaky_relu", inputs, outputs, attrs, scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - LeakyRelu(input, &output_cmp, alpha); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (gap > 1e-5 && std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; 
- exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestLeakyReluOp({1, 1, 2, 3}, 0.2f); - paddle_mobile::TestLeakyReluOp({1, 3, 11, 22}, 0.3f); - paddle_mobile::TestLeakyReluOp({1, 32, 112, 112}, 0.4f); - std::cout << "test leaky_relu op pass." << std::endl; - return 0; -} diff --git a/mobile/test/operators/test_less_than_op.cpp b/mobile/test/operators/test_less_than_op.cpp deleted file mode 100644 index 35f5e6fe74..0000000000 --- a/mobile/test/operators/test_less_than_op.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "../test_include.h" -#include "operators/compare_op.h" - -namespace paddle_mobile { - -template -void LessThan(const framework::Tensor *X, const framework::Tensor *Y, - const int Axis, framework::Tensor *Out) { - const T *x = X->data(); - const T *y = Y->data(); - bool *output = Out->mutable_data(); - const auto &x_dims = X->dims(); - const auto &y_dims = Y->dims(); - /// axis = -1 represent the last dimensions. - int axis = (Axis == -1 ? 
x_dims.size() - y_dims.size() : Axis); - int batch = 1; - int channels = 1; - int elementwise_num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - elementwise_num *= x_dims[i]; - } - // less than - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int x_offset = (i * channels + j) * elementwise_num; - int y_offset = j * elementwise_num; - for (int k = 0; k < elementwise_num; ++k) { - output[x_offset + k] = (x[x_offset + k] < y[y_offset]); - } - } - } -} - -template -int TestLessThanOp(const std::vector &x_shape, - const std::vector &y_shape, const int axis) { - framework::DDim xdims = framework::make_ddim(x_shape); - framework::DDim ydims = framework::make_ddim(y_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputx"}); - inputs["Y"] = std::vector({"inputy"}); - outputs["Out"] = std::vector({"output"}); - - auto inputx_var = scope.get()->Var("inputx"); - auto inputx = inputx_var->template GetMutable(); - SetupTensor(inputx, xdims, static_cast(-100), static_cast(100)); - auto inputy_var = scope.get()->Var("inputy"); - auto inputy = inputy_var->template GetMutable(); - SetupTensor(inputy, ydims, static_cast(-100), static_cast(100)); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - attrs["axis"].Set(axis); - auto *op = new operators::LessThanOp("less_than", inputs, outputs, - attrs, scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - bool *output_cmp_data = output_cmp.mutable_data(output->dims()); - LessThan(inputx, inputy, axis, &output_cmp); - - const bool *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - if (output_data[i] != output_cmp_data[i]) { - 
LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestLessThanOp({1, 2, 3}, {1, 2, 3}, 0); - paddle_mobile::TestLessThanOp({10, 2, 1}, {10, 2, 1}, 0); - - paddle_mobile::TestLessThanOp({2, 10, 1}, {1, 10, 1}, 1); - paddle_mobile::TestLessThanOp({10, 2, 1}, {1, 2, 1}, 1); - - paddle_mobile::TestLessThanOp({1, 2, 3}, {1, 2, 3}, 0); - paddle_mobile::TestLessThanOp({10, 2, 1}, {10, 2, 1}, 0); - - paddle_mobile::TestLessThanOp({2, 10, 1}, {1, 10, 1}, 1); - paddle_mobile::TestLessThanOp({10, 2, 1}, {1, 2, 1}, 1); - - std::cout << "test less_than op pass." << std::endl; - return 0; -} diff --git a/mobile/test/operators/test_log_op.cpp b/mobile/test/operators/test_log_op.cpp deleted file mode 100644 index f0bba93d54..0000000000 --- a/mobile/test/operators/test_log_op.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_include.h" -#include "operators/activation_op.h" - -namespace paddle_mobile { - -void Log(const framework::Tensor *X, framework::Tensor *Y) { - const float *x = X->data(); - float *y = Y->mutable_data(); - - for (int i = 0; i < X->numel(); ++i) { - y[i] = log(x[i]); - } -} - -int TestLogOp(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, 0.0001, 100.0); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - auto *op = new operators::LogOp("log", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - Log(input, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestLogOp({1, 1, 2, 3}); - paddle_mobile::TestLogOp({1, 3, 11, 22}); - paddle_mobile::TestLogOp({1, 32, 112, 112}); - return 0; -} diff --git a/mobile/test/operators/test_logical_and_op.cpp b/mobile/test/operators/test_logical_and_op.cpp deleted file mode 100644 index 380b253efe..0000000000 --- a/mobile/test/operators/test_logical_and_op.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle 
Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/logical_op.h" - -namespace paddle_mobile { - -void LogicalAnd(const framework::Tensor *inputX, - const framework::Tensor *inputY, framework::Tensor *output) { - auto x_data = inputX->data(); - auto y_data = inputY->data(); - auto output_data = output->data(); - for (int i = 0; i < inputX->numel(); ++i) { - *output_data = *x_data && *y_data; - x_data++; - y_data++; - output_data++; - } -} - -int TestLogicalAndOp(const std::vector input_shape) { - framework::DDim input_dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputX"}); - inputs["Y"] = std::vector({"inputY"}); - outputs["Out"] = std::vector({"output"}); - - auto x_var = scope.get()->Var("inputX"); - auto x = x_var->template GetMutable(); - SetupTensor(x, input_dims, 0, 1); - - auto y_var = scope.get()->Var("inputY"); - auto y = y_var->template GetMutable(); - SetupTensor(y, input_dims, 0, 1); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - - auto *op = new operators::LogicalAndOp( - "logical_and", inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - framework::Tensor output_cmp; - bool *output_cmp_data = output_cmp.mutable_data(output->dims()); - LogicalAnd(x, y, 
&output_cmp); - - const bool *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - if (output_data[i] != output_cmp_data[i]) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestLogicalAndOp({1, 1, 2, 3}); - paddle_mobile::TestLogicalAndOp({1, 3, 11, 12}); - paddle_mobile::TestLogicalAndOp({1, 16, 32, 32}); - DLOG << "test logical_and op pass."; - return 0; -} diff --git a/mobile/test/operators/test_logical_not_op.cpp b/mobile/test/operators/test_logical_not_op.cpp deleted file mode 100644 index 8d88362210..0000000000 --- a/mobile/test/operators/test_logical_not_op.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/logical_op.h" - -namespace paddle_mobile { - -void LogicalNot(const framework::Tensor *inputX, framework::Tensor *output) { - auto x_data = inputX->data(); - auto output_data = output->data(); - for (int i = 0; i < inputX->numel(); ++i) { - *output_data = !*x_data; - x_data++; - output_data++; - } -} - -int TestLogicalNotOp(const std::vector input_shape) { - framework::DDim input_dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputX"}); - outputs["Out"] = std::vector({"output"}); - - auto x_var = scope.get()->Var("inputX"); - auto x = x_var->template GetMutable(); - SetupTensor(x, input_dims, 0, 1); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - - auto *op = new operators::LogicalNotOp( - "logical_not", inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - framework::Tensor output_cmp; - bool *output_cmp_data = output_cmp.mutable_data(output->dims()); - LogicalNot(x, &output_cmp); - - const bool *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - if (output_data[i] != output_cmp_data[i]) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestLogicalNotOp({1, 1, 2, 3}); - paddle_mobile::TestLogicalNotOp({1, 3, 11, 12}); - paddle_mobile::TestLogicalNotOp({1, 16, 32, 32}); - DLOG << "test logical_not op pass."; - return 0; -} diff --git a/mobile/test/operators/test_logical_or_op.cpp b/mobile/test/operators/test_logical_or_op.cpp deleted file mode 100644 index 9ea555b65b..0000000000 --- a/mobile/test/operators/test_logical_or_op.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/logical_op.h" - -namespace paddle_mobile { - -void LogicalOr(const framework::Tensor *inputX, const framework::Tensor *inputY, - framework::Tensor *output) { - auto x_data = inputX->data(); - auto y_data = inputY->data(); - auto output_data = output->data(); - for (int i = 0; i < inputX->numel(); ++i) { - *output_data = *x_data || *y_data; - x_data++; - y_data++; - output_data++; - } -} - -int TestLogicalOrOp(const std::vector input_shape) { - framework::DDim input_dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputX"}); - inputs["Y"] = std::vector({"inputY"}); - outputs["Out"] = std::vector({"output"}); - - auto x_var = scope.get()->Var("inputX"); - auto x = x_var->template GetMutable(); - SetupTensor(x, input_dims, 0, 1); - - auto y_var = scope.get()->Var("inputY"); - auto y = y_var->template GetMutable(); - SetupTensor(y, input_dims, 0, 1); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - - auto *op = new operators::LogicalOrOp( - "logical_or", inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - framework::Tensor output_cmp; - bool *output_cmp_data = output_cmp.mutable_data(output->dims()); - LogicalOr(x, y, 
&output_cmp); - - const bool *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - if (output_data[i] != output_cmp_data[i]) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestLogicalOrOp({1, 1, 2, 3}); - paddle_mobile::TestLogicalOrOp({1, 3, 11, 12}); - paddle_mobile::TestLogicalOrOp({1, 16, 32, 32}); - DLOG << "test logical_or op pass."; - return 0; -} diff --git a/mobile/test/operators/test_logical_xor_op.cpp b/mobile/test/operators/test_logical_xor_op.cpp deleted file mode 100644 index a776de0e8b..0000000000 --- a/mobile/test/operators/test_logical_xor_op.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/logical_op.h" - -namespace paddle_mobile { - -void LogicalXor(const framework::Tensor *inputX, - const framework::Tensor *inputY, framework::Tensor *output) { - auto x_data = inputX->data(); - auto y_data = inputY->data(); - auto output_data = output->data(); - for (int i = 0; i < inputX->numel(); ++i) { - bool x = *x_data; - bool y = *y_data; - *output_data = (x || y) && !(x && y); - x_data++; - y_data++; - output_data++; - } -} - -int TestLogicalXorOp(const std::vector input_shape) { - framework::DDim input_dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputX"}); - inputs["Y"] = std::vector({"inputY"}); - outputs["Out"] = std::vector({"output"}); - - auto x_var = scope.get()->Var("inputX"); - auto x = x_var->template GetMutable(); - SetupTensor(x, input_dims, 0, 1); - - auto y_var = scope.get()->Var("inputY"); - auto y = y_var->template GetMutable(); - SetupTensor(y, input_dims, 0, 1); - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - - auto *op = new operators::LogicalXorOp( - "logical_xor", inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - framework::Tensor output_cmp; - bool *output_cmp_data = output_cmp.mutable_data(output->dims()); - LogicalXor(x, y, &output_cmp); - - const bool *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - if (output_data[i] != output_cmp_data[i]) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestLogicalXorOp({1, 1, 2, 3}); - paddle_mobile::TestLogicalXorOp({1, 3, 11, 12}); - paddle_mobile::TestLogicalXorOp({1, 16, 32, 32}); - DLOG << 
"test logical_xor op pass."; - return 0; -} diff --git a/mobile/test/operators/test_lrn_op.cpp b/mobile/test/operators/test_lrn_op.cpp deleted file mode 100644 index 5d1ac9b4dd..0000000000 --- a/mobile/test/operators/test_lrn_op.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/lrn_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(g_googlenet); - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "lrn"); - - // 1. input_tensors; - vector input_tensors; - - Tensor input1; - auto input1_data = CreateInput(&input1, {3, 4, 2, 2}, 0, 1); - input_tensors.push_back(input1); - - // 2. input_names - vector input_names({ - "pool2d_0.tmp_0", - }); - - // 3. output_names - vector output_names({"pool1_norm1.tmp_1"}); - - // 4. 
out_dims; - vector out_ddims; - auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2}); - out_ddims.push_back(out_ddim); - - auto output = executor.Predict(input_tensors, input_names, - output_names, out_ddims); - - auto output0_data = output[0]->data(); - - DLOG << " LrnOp input: "; - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 4; j++) { - for (int c = 0; c < 2; c++) { - for (int d = 0; d < 2; d++) { - DLOGF("%f ", input1_data[i * 16 + j * 4 + c * 2 + d]); - } - DLOGF("\n"); - } - DLOGF("\n"); - } - DLOGF("\n"); - } - DLOG << " LrnOp output: "; - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 4; j++) { - for (int c = 0; c < 2; c++) { - for (int d = 0; d < 2; d++) { - DLOGF("%f ", output0_data[i * 16 + j * 4 + c * 2 + d]); - } - DLOGF("\n"); - } - DLOGF("\n"); - } - DLOGF("\n"); - } - DLOG << input1_data[0] << " / ((1 + 0.00002 * ( " << input1_data[0] << "^2 + " - << input1_data[4] << "^2 + " << input1_data[8] << "^2 ))^0.75) = "; - DLOG << output0_data[0]; - return 0; -} diff --git a/mobile/test/operators/test_mul_op.cpp b/mobile/test/operators/test_mul_op.cpp deleted file mode 100644 index 6ac2c45564..0000000000 --- a/mobile/test/operators/test_mul_op.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/mul_op.h" - -#define a(i, j) a[(i)*lda + (j)] -#define b(i, j) b[(i)*ldb + (j)] -#define c(i, j) c[(i)*ldc + (j)] - -namespace paddle_mobile { -using framework::AttributeMap; -using framework::DDim; -using framework::Scope; -using framework::make_ddim; -template -int TestMulOP() { - int32_t m = 1024; - int32_t n = 1024; - int32_t k = 1024; - int32_t lda = k; - int32_t ldb = n; - int32_t ldc = n; - DDim inputA_shape = make_ddim({m, k}); - DDim inputB_shape = make_ddim({k, n}); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"inputA"}); - inputs["Y"] = std::vector({"inputB"}); - outputs["Out"] = std::vector({"output"}); - - auto inputA_var = scope.get()->Var("inputA"); - auto inputA = inputA_var->template GetMutable(); - SetupTensor(inputA, inputA_shape, -127, 127); - auto inputB_var = scope.get()->Var("inputB"); - auto inputB = inputB_var->template GetMutable(); - SetupTensor(inputB, inputB_shape, -127, 127); - - auto output_var = scope.get()->Var("output"); - AttributeMap attrs; - attrs["x_num_col_dims"].Set(1); - attrs["y_num_col_dims"].Set(1); - auto *op = new operators::MulOp("mul", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Run(); - auto output = output_var->template Get(); - const O *output_data = output->data(); - // compare - O *c = static_cast(memory::Alloc(sizeof(O) * m * n)); - I *a = inputA->data(); - I *b = inputB->data(); - for (int32_t i = 0; i < m; ++i) { - for (int32_t j = 0; j < n; ++j) { - O r = 0; - for (int32_t p = 0; p < k; p++) { - r += static_cast(a(i, p)) * static_cast(b(p, j)); - } - c(i, j) = r; - } - } - - int32_t eq = 0; - int32_t neq = 0; - for (int32_t i = 0; i < m * n; ++i) { - PADDLE_MOBILE_ENFORCE( - output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i, - static_cast(output_data[i]), i, static_cast(c[i])); - if (output_data[i] == c[i]) 
{ - ++eq; - } else { - ++neq; - } - } - std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq - << " neq=" << neq << std::endl; - delete op; - return 0; -} -} // namespace paddle_mobile - -int main() { - paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - paddle_mobile::TestMulOP(); - paddle_mobile::TestMulOP(); - return 0; -} diff --git a/mobile/test/operators/test_multiclass_nms_op.cpp b/mobile/test/operators/test_multiclass_nms_op.cpp deleted file mode 100644 index 782dd6af94..0000000000 --- a/mobile/test/operators/test_multiclass_nms_op.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/multiclass_nms_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestMultiClassNMSOp { - public: - explicit TestMultiClassNMSOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - for (auto block_desc : blocks) { - std::vector> ops = block_desc->Ops(); - for (auto op : ops) { - if (op->Type() == "multiclass_nms" && - op->Input("BBoxes")[0] == "box_coder_0.tmp_0") { - DLOG << " attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - DLOG << " BBoxes is : " << op->Input("BBoxes")[0]; - DLOG << " Scores is : " << op->Input("Scores")[0]; - DLOG << " Out is : " << op->Output("Out")[0]; - DLOG << " keep_top_k : " - << op->GetAttrMap().at("keep_top_k").Get(); - DLOG << " background_label : " - << op->GetAttrMap().at("background_label").Get(); - DLOG << " nms_eta : " << op->GetAttrMap().at("nms_eta").Get(); - DLOG << " nms_threshold : " - << op->GetAttrMap().at("nms_threshold").Get(); - DLOG << " nms_top_k : " - << op->GetAttrMap().at("nms_top_k").Get(); - DLOG << " score_threshold : " - << op->GetAttrMap().at("score_threshold").Get(); - std::shared_ptr> priorbox = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(priorbox); - } - } - } - } - - std::shared_ptr predict(const Tensor &t1, const Tensor &t2) { - // feed - auto scope = program_.scope.get(); - Variable *x1_feed_value = scope->Var("box_coder_0.tmp_0"); - auto tensor_x1 = x1_feed_value->GetMutable(); - tensor_x1->ShareDataWith(t1); - - Variable *x2_feed_value = scope->Var("transpose_12.tmp_0"); - auto tensor_x2 = x2_feed_value->GetMutable(); - 
tensor_x2->ShareDataWith(t2); - - Variable *output = scope->Var("detection_output_0.tmp_0"); - auto *output_tensor = output->GetMutable(); - output_tensor->mutable_data({1917, 6}); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict(t1, t2, 0); - - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - - void predict(const Tensor &t1, const Tensor &t2, int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - DLOG << "op -> run()"; - op->Run(); - } - } -}; - -template class TestMultiClassNMSOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run MulticlassNMS Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_mobilenet_ssd)); - paddle_mobile::framework::Tensor inputx1; - SetupTensor(&inputx1, {1, 2, 4}, static_cast(0), - static_cast(1)); - auto *inputx1_ptr = inputx1.data(); - const float x1[] = {0, 0, 100, 100, 50, 50, 150, 150}; - for (int i = 0; i < 8; ++i) { - *(inputx1_ptr + i) = x1[i]; - } - - paddle_mobile::framework::Tensor inputx2; - SetupTensor(&inputx2, {1, 2, 2}, static_cast(0), - static_cast(1)); - auto *inputx2_ptr = inputx2.data(); - const float x2[] = {0.4, 0.3, 0.6, 0.7}; - for (int i = 0; i < 4; ++i) { - *(inputx2_ptr + i) = x2[i]; - } - - paddle_mobile::framework::TestMultiClassNMSOp - testMultiClassNMSOp(program); - - auto output = testMultiClassNMSOp.predict(inputx1, inputx2); - auto *output_ptr = output->data(); - - for (int i = 0; i < output->numel(); ++i) { - DLOG << output_ptr[i]; - } - - // test multi point - paddle_mobile::framework::Tensor inputx3; - SetupTensor(&inputx3, {1, 2, 
8}, static_cast(0), - static_cast(1)); - auto *inputx3_ptr = inputx3.data(); - const float x3[] = {0, 0, 100, 0, 100, 100, 0, 100, - 50, 50, 150, 50, 150, 150, 50, 150}; - for (int i = 0; i < 16; ++i) { - *(inputx3_ptr + i) = x3[i]; - } - - auto output2 = testMultiClassNMSOp.predict(inputx3, inputx2); - auto *output_ptr2 = output2->data(); - - for (int i = 0; i < output2->numel(); ++i) { - DLOG << output_ptr2[i]; - } - return 0; -} diff --git a/mobile/test/operators/test_polygon_box_transform_op.cpp b/mobile/test/operators/test_polygon_box_transform_op.cpp deleted file mode 100644 index bfd8fb3cc2..0000000000 --- a/mobile/test/operators/test_polygon_box_transform_op.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/polygon_box_transform_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestPolygonBoxTransformOp { - public: - explicit TestPolygonBoxTransformOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - const std::vector> blocks = - to_predict_program_->Blocks(); - for (auto block_desc : blocks) { - std::vector> ops = block_desc->Ops(); - for (auto op : ops) { - if (op->Type() == "polygon_box_transform") { - DLOG << " attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " input is : " << op->Input("Input")[0]; - input_var_name = op->Input("Input")[0]; - DLOG << " outputs size: " << op->GetOutputs().size(); - DLOG << " output is : " << op->Output("Output")[0]; - output_var_name = op->Output("Output")[0]; - std::shared_ptr> - op_ptr = std::make_shared< - operators::PolygonBoxTransformOp>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(op_ptr); - return; - } - } - } - } - - std::shared_ptr predict(const Tensor &t) { - auto scope = program_.scope.get(); - Variable *input_feed_value = scope->Var(input_var_name); - auto tensor_input = input_feed_value->GetMutable(); - tensor_input->ShareDataWith(t); - - Variable *output = scope->Var(output_var_name); - auto *output_tensor = output->GetMutable(); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict(t, 0); - - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - string input_var_name; - string output_var_name; - - void predict(const Tensor &t, int block_id) { - std::shared_ptr to_predict_block = - 
to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - op->Run(); - } - } -}; - -template class TestPolygonBoxTransformOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run PolygonBoxTransform Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_ocr)); - - paddle_mobile::framework::Tensor input; - SetupTensor(&input, {1, 8, 1, 2}, static_cast(0), - static_cast(1)); - auto *input_ptr = input.data(); - for (int i = 0; i < 16; ++i) { - *(input_ptr + i) = i; - } - DLOG << "input : "; - for (int i = 0; i < input.numel(); ++i) { - DLOG << " index " << i << " : " << input_ptr[i]; - } - - paddle_mobile::framework::TestPolygonBoxTransformOp - testPolygonBoxTransformOp(program); - - auto output = testPolygonBoxTransformOp.predict(input); - auto *output_ptr = output->data(); - - DLOG << "output : "; - for (int i = 0; i < output->numel(); ++i) { - DLOG << " index " << i << " : " << output_ptr[i]; - } - return 0; -} diff --git a/mobile/test/operators/test_pool_op.cpp b/mobile/test/operators/test_pool_op.cpp deleted file mode 100644 index 44bb132e79..0000000000 --- a/mobile/test/operators/test_pool_op.cpp +++ /dev/null @@ -1,231 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_include.h" -#include "operators/math/pooling.h" -#include "operators/pool_op.h" - -namespace paddle_mobile { - -namespace math = operators::math; - -template -int TestPoolOp(int in_channels, int in_height, int in_width) { - int kernel_h = Kernel; - int kernel_w = Kernel; - int pad_h = Pad; - int pad_w = Pad; - int stride_h = Stride; - int stride_w = Stride; - std::string pooling_type = (PoolType == 0 ? "max" : "avg"); - - int batch_size = 1; - int input_c = in_channels; - int input_h = in_height; - int input_w = in_width; - - framework::DDim input_shape = - framework::make_ddim({batch_size, input_c, input_h, input_w}); - - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, input_shape, -127, 127); - - // for (int i = 0; i < input->numel(); ++i) { - // DLOG << "input[" << i << "] = " << input->data()[i]; - // } - - auto output_var = scope.get()->Var("output"); - framework::AttributeMap attrs; - attrs["pooling_type"].Set(pooling_type); - attrs["ksize"].Set>(std::vector({kernel_h, kernel_w})); - attrs["strides"].Set>(std::vector({stride_h, stride_w})); - attrs["paddings"].Set>(std::vector({pad_h, pad_w})); - attrs["ceil_mode"].Set(true); - // attrs["ceil_mode"].Set(false); - attrs["global_pooling"].Set(false); - attrs["exclusive"].Set(true); - - auto *op = new operators::PoolOp("pool2d", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - framework::Tensor output_cmp; - output_cmp.mutable_data(output->dims()); - - if (pooling_type == "avg") { - math::Pooling()(*input, std::vector{kernel_h, kernel_w}, - std::vector{stride_h, stride_w}, - std::vector{pad_h, pad_w}, &output_cmp); - } else { - math::Pooling()(*input, 
std::vector{kernel_h, kernel_w}, - std::vector{stride_h, stride_w}, - std::vector{pad_h, pad_w}, &output_cmp); - } - - // compare results - const float *output_data = output->data(); - float *output_cmp_data = output_cmp.data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - // PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], - // "output[%d] = %d, output_cmp[%d] = %d", i, - // output_data[i], i, output_cmp_data[i]); - if (gap > 1e-5 && std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - exit(1); - } - } - delete op; - return 0; -} -} // namespace paddle_mobile - -int Test(const int in_channels, const int in_height, const int in_width) { - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=0, stride=1"; - paddle_mobile::TestPoolOp<0, 3, 0, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=1, stride=1"; - paddle_mobile::TestPoolOp<0, 3, 1, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=2, stride=1"; - paddle_mobile::TestPoolOp<0, 3, 2, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=5, stride=1"; - paddle_mobile::TestPoolOp<0, 3, 5, 1>(in_channels, in_height, in_width); - - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=0, stride=1"; - paddle_mobile::TestPoolOp<1, 3, 0, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=1, stride=1"; - paddle_mobile::TestPoolOp<1, 3, 1, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=2, stride=1"; - paddle_mobile::TestPoolOp<1, 3, 2, 1>(in_channels, in_height, 
in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=5, stride=1"; - paddle_mobile::TestPoolOp<1, 3, 5, 1>(in_channels, in_height, in_width); - - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=0, stride=2"; - paddle_mobile::TestPoolOp<0, 3, 0, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=1, stride=2"; - paddle_mobile::TestPoolOp<0, 3, 1, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=2, stride=2"; - paddle_mobile::TestPoolOp<0, 3, 2, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=3, pad=5, stride=2"; - paddle_mobile::TestPoolOp<0, 3, 5, 2>(in_channels, in_height, in_width); - - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=0, stride=2"; - paddle_mobile::TestPoolOp<1, 3, 0, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=1, stride=2"; - paddle_mobile::TestPoolOp<1, 3, 1, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=2, stride=2"; - paddle_mobile::TestPoolOp<1, 3, 2, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=3, pad=5, stride=2"; - paddle_mobile::TestPoolOp<1, 3, 5, 2>(in_channels, in_height, in_width); - - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=0, stride=1"; - paddle_mobile::TestPoolOp<0, 2, 0, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=1, stride=1"; - paddle_mobile::TestPoolOp<0, 2, 1, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=2, stride=1"; - paddle_mobile::TestPoolOp<0, 2, 2, 1>(in_channels, in_height, 
in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=5, stride=1"; - paddle_mobile::TestPoolOp<0, 2, 5, 1>(in_channels, in_height, in_width); - - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=0, stride=1"; - paddle_mobile::TestPoolOp<1, 2, 0, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=1, stride=1"; - paddle_mobile::TestPoolOp<1, 2, 1, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=2, stride=1"; - paddle_mobile::TestPoolOp<1, 2, 2, 1>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=5, stride=1"; - paddle_mobile::TestPoolOp<1, 2, 5, 1>(in_channels, in_height, in_width); - - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=0, stride=2"; - paddle_mobile::TestPoolOp<0, 2, 0, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=1, stride=2"; - paddle_mobile::TestPoolOp<0, 2, 1, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=2, stride=2"; - paddle_mobile::TestPoolOp<0, 2, 2, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=max, kernel=2, pad=5, stride=2"; - paddle_mobile::TestPoolOp<0, 2, 5, 2>(in_channels, in_height, in_width); - - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=0, stride=2"; - paddle_mobile::TestPoolOp<1, 2, 0, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=1, stride=2"; - paddle_mobile::TestPoolOp<1, 2, 1, 2>(in_channels, in_height, in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=2, stride=2"; - paddle_mobile::TestPoolOp<1, 2, 2, 2>(in_channels, in_height, 
in_width); - LOG(paddle_mobile::kLOG_INFO) - << "float, pooling_type=avg, kernel=2, pad=5, stride=2"; - paddle_mobile::TestPoolOp<1, 2, 5, 2>(in_channels, in_height, in_width); -} - -int main(int argc, char *argv[]) { - // if (argc < 4) { - // LOG(paddle_mobile::kLOG_INFO) - // << "Usage:\n" - // << " ./test-pool-op in_channels in_height in_width \n" - // << " params:\n" - // << " -in_channels: int, input image's channels\n" - // << " -in_height: int, input image's height\n" - // << " -in_width: int, input image's width\n"; - // return 1; - // } - // int in_channels = atoi(argv[1]); - // int in_height = atoi(argv[2]); - // int in_width = atoi(argv[3]); - Test(1, 10, 10); - Test(1, 50, 50); - Test(32, 10, 10); - Test(32, 50, 50); -} diff --git a/mobile/test/operators/test_prelu_op.cpp b/mobile/test/operators/test_prelu_op.cpp deleted file mode 100644 index f98c9904ae..0000000000 --- a/mobile/test/operators/test_prelu_op.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../executor_for_test.h" -#include "../test_include.h" -#include "operators/prelu_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(g_resnet); - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "prelu"); - - // 1. 
input_tensors; - vector input_tensors; - - Tensor input1; - auto input1_data = CreateInput(&input1, {1, 2, 3, 4}, -1, 1); - input_tensors.push_back(input1); - - // 2. input_names - vector input_names({ - "batch_norm_0.tmp_2", - }); - - // 3. output_names - vector output_names({"batch_norm_0.tmp_3"}); - - // 4. out_dims; - vector out_ddims; - auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4}); - out_ddims.push_back(out_ddim); - - auto output = executor.Predict(input_tensors, input_names, - output_names, out_ddims); - - auto output0_data = output[0]->data(); - - for (int j = 0; j < output[0]->numel(); ++j) { - DLOG << " value of output: " << output0_data[j]; - } - return 0; -} diff --git a/mobile/test/operators/test_prior_box_op.cpp b/mobile/test/operators/test_prior_box_op.cpp deleted file mode 100644 index b2f05a18e6..0000000000 --- a/mobile/test/operators/test_prior_box_op.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/prior_box_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestPriorBoxOp { - public: - explicit TestPriorBoxOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); - for (auto block_desc : blocks) { - std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); - for (auto op : ops) { - if (op->Type() == "prior_box" && - op->Input("Input")[0] == "batch_norm_26.tmp_3") { - DLOG << " mul attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - DLOG << " Input is : " << op->Input("Input")[0]; - DLOG << " Image is : " << op->Input("Image")[0]; - DLOG << " Output Boxes is : " << op->Output("Boxes")[0]; - DLOG << " Output Variances is : " << op->Output("Variances")[0]; - DLOG << " offset : " << op->GetAttrMap().at("offset").Get(); - DLOG << " step_h : " << op->GetAttrMap().at("step_h").Get(); - DLOG << " step_w : " << op->GetAttrMap().at("step_w").Get(); - DLOG << " flip : " << op->GetAttrMap().at("flip").Get(); - DLOG << " clip : " << op->GetAttrMap().at("clip").Get(); - // DLOG << " variances : " << - // op->GetAttrMap().at("variances").Get>(); - // DLOG << " aspect_ratios : " << - // op->GetAttrMap().at("aspect_ratios").Get>(); - // DLOG << " min_sizes : " << - // op->GetAttrMap().at("min_sizes").Get>(); - // DLOG << " max_sizes : " << - // op->GetAttrMap().at("max_sizes").Get>(); - std::shared_ptr> priorbox = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(priorbox); - } - } - } - } - - std::shared_ptr predict_priorbox(const Tensor &t1, const 
Tensor &t2) { - // feed - auto scope = program_.scope.get(); - Variable *x1_feed_value = scope->Var("image"); - auto tensor_x1 = x1_feed_value->GetMutable(); - tensor_x1->ShareDataWith(t1); - - Variable *x2_feed_value = scope->Var("batch_norm_26.tmp_3"); - auto tensor_x2 = x2_feed_value->GetMutable(); - tensor_x2->ShareDataWith(t2); - - Variable *boxes_output = scope->Var("prior_box_1.tmp_0"); - auto *boxes_output_tensor = boxes_output->GetMutable(); - boxes_output_tensor->mutable_data({10, 10, 6, 4}); - - Variable *variances_output = scope->Var("prior_box_1.tmp_1"); - auto *variances_output_tesnor = variances_output->GetMutable(); - variances_output_tesnor->mutable_data({10, 10, 6, 4}); - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - - std::shared_ptr outboxes_tensor = std::make_shared(); - outboxes_tensor.reset(boxes_output_tensor); - - std::shared_ptr outvars_tensor = std::make_shared(); - outvars_tensor.reset(variances_output_tesnor); - predict_priorbox(t1, t2, 0); - - return outboxes_tensor; - // return outvars_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - - void predict_priorbox(const Tensor &t1, const Tensor &t2, int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - DLOG << "op -> run()"; - op->Run(); - } - } -}; - -template class TestPriorBoxOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run PriorBoxOp Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_mobilenet_ssd)); - - /// input x (1,3,300,300) - paddle_mobile::framework::Tensor input_image; - SetupTensor(&input_image, {1, 3, 
300, 300}, static_cast(0), - static_cast(1)); - auto *input_image_ptr = input_image.data(); - - paddle_mobile::framework::Tensor inputx1; - SetupTensor(&inputx1, {1, 1024, 10, 10}, static_cast(0), - static_cast(1)); - auto *inputx1_ptr = inputx1.data(); - - paddle_mobile::framework::TestPriorBoxOp testPriorBoxOp( - program); - - auto output_priorbox = testPriorBoxOp.predict_priorbox(input_image, inputx1); - auto *output_priorbox_ptr = output_priorbox->data(); - - for (int i = 0; i < output_priorbox->numel(); i++) { - DLOG << output_priorbox_ptr[i]; - } - return 0; -} diff --git a/mobile/test/operators/test_quantize_op.cpp b/mobile/test/operators/test_quantize_op.cpp deleted file mode 100644 index d8e72e9b14..0000000000 --- a/mobile/test/operators/test_quantize_op.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/quantize_op.h" - -namespace paddle_mobile { -namespace round { -enum RoundType { - RoundToEven = 0, - RoundAwayZero = 1, - RoundTowardsZero = 2, -}; -} - -template -struct Round { - int8_t operator()(float x); -}; - -template <> -struct Round { - int8_t operator()(float x) { return std::round(x); } -}; - -template <> -struct Round { - int8_t operator()(float x) { return int8_t(x); } -}; - -template <> -struct Round { - int8_t operator()(float x) { - float v = std::round(x); - int32_t q = static_cast(v); - if (abs(abs(q - v) - 0.5) <= 0) { - if (abs(q) % 2 != 0) { - q = q + ((q > 0) ? -1 : 1); - } - } - return static_cast(q); - } -}; - -template -static void quantize(const Tensor *input, const float scale, Tensor *output) { - int batch_size = input->dims()[0]; - int channels = input->dims()[1]; - int input_h = input->dims()[2]; - int input_w = input->dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - size_t input_spatial = input_h * input_w; - size_t output_spatial = output_h * output_w; - const float *x = input->data(); - int8_t *y = output->mutable_data(); - - for (int nc = 0; nc < batch_size * channels; ++nc) { - const float *xh = x + nc * input_spatial; - int8_t *yh = y + nc * output_spatial; - for (int h = 0; h < input_h; ++h, yh += output_w, xh += input_w) { - for (int w = 0; w < input_w; ++w) { - yh[w] = Round()(xh[w] * scale); - } - } - } -} - -static float find_abs_max(const Tensor *input) { - float max_abs = 0.f; - const float *x = input->data(); - size_t size = input->numel(); - for (size_t i = 0; i < size; ++i) { - float value = std::abs(x[i]); - if (value > max_abs) { - max_abs = value; - } - } - return max_abs; -} - -int TestQuqntizeOp(const int batch_size, const int channel, const int height, - const int width) { - DLOG << "batch_size: " << batch_size << ", channel: " << channel - << ", height: " << height << ", width: " << 
width; - framework::DDim dim = - framework::make_ddim({batch_size, channel, height, width}); - - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - outputs["OutScale"] = std::vector({"output_scale"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dim, -100.f, 100.f); - - auto output_var = scope.get()->Var("output"); - auto output_scale_var = scope.get()->Var("output_scale"); - - framework::AttributeMap attrs; - auto *op = new operators::QuantizeOp("quantize", inputs, outputs, - attrs, scope.get()); - op->InferShape(); - op->Run(); - - auto output = output_var->template Get(); - const int8_t *output_data = output->data(); - auto output_scale = output_scale_var->template Get(); - const float *output_scale_data = output_scale->data(); - - float output_scale_cmp = find_abs_max(input); - PADDLE_MOBILE_ENFORCE(output_scale_cmp == output_scale_data[0], - "output_scale = %.6f, output_scale_cmp = %.6f", - output_scale_cmp, output_scale_data[0]); - - framework::Tensor output_cmp; - output_cmp.Resize(output->dims()); - float scale = 127 / output_scale_cmp; - quantize(input, scale, &output_cmp); - int8_t *output_cmp_data = output_cmp.data(); - for (int i = 0; i < output->numel(); ++i) { - PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], - "output[%d] = %d, output_cmp[%d] = %d", i, - static_cast(output_data[i]), i, - static_cast(output_cmp_data[i])); - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - TestQuqntizeOp(1, 10, 10, 5); - TestQuqntizeOp(1, 111, 111, 5); - TestQuqntizeOp(5, 111, 111, 5); -} diff --git a/mobile/test/operators/test_relu6_op.cpp b/mobile/test/operators/test_relu6_op.cpp deleted file mode 100644 index 8681c4155d..0000000000 --- a/mobile/test/operators/test_relu6_op.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "../test_include.h" -#include "operators/activation_op.h" - -namespace paddle_mobile { - -void Relu6(const framework::Tensor *X, framework::Tensor *Y) { - const float *x = X->data(); - float *y = Y->mutable_data(); - - for (int i = 0; i < X->numel(); ++i) { - float q = x[i]; - y[i] = std::min(std::max(0.f, q), 6.f); - } -} - -int TestRelu6Op(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - attrs["threshold"].Set(6.f); - auto *op = new operators::Relu6Op("relu6", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - Relu6(input, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 
1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestRelu6Op({1, 1, 2, 3}); - paddle_mobile::TestRelu6Op({1, 3, 11, 22}); - paddle_mobile::TestRelu6Op({1, 32, 112, 112}); - std::cout << "test relu6 op pass." << std::endl; - return 0; -} diff --git a/mobile/test/operators/test_relu_op.cpp b/mobile/test/operators/test_relu_op.cpp deleted file mode 100644 index d173845386..0000000000 --- a/mobile/test/operators/test_relu_op.cpp +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_include.h" -#include "operators/activation_op.h" - -namespace paddle_mobile { - -void Relu(const framework::Tensor *X, framework::Tensor *Y) { - const float *x = X->data(); - float *y = Y->mutable_data(); - - for (int i = 0; i < X->numel(); ++i) { - float q = x[i]; - y[i] = std::max(0.f, q); - } -} - -int TestReluOp(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - auto *op = new operators::ReluOp("relu", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - Relu(input, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestReluOp({1, 1, 2, 3}); - paddle_mobile::TestReluOp({1, 3, 11, 22}); - paddle_mobile::TestReluOp({1, 32, 112, 112}); - std::cout << "test relu op pass." 
<< std::endl; - return 0; -} diff --git a/mobile/test/operators/test_reshape2_op.cpp b/mobile/test/operators/test_reshape2_op.cpp deleted file mode 100644 index 69edd34bf6..0000000000 --- a/mobile/test/operators/test_reshape2_op.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/reshape2_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestReshape2Op { - public: - explicit TestReshape2Op(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - const std::vector> blocks = - to_predict_program_->Blocks(); - for (auto block_desc : blocks) { - std::vector> ops = block_desc->Ops(); - for (auto op : ops) { - if (op->Type() == "reshape2") { - DLOG << " attr size: " << op->GetAttrMap().size(); - std::unordered_map attrs = op->GetAttrMap(); - for (std::unordered_map::iterator it = - attrs.begin(); - it != attrs.end(); ++it) { - DLOG << " " << it->first << " " << it->second; - } - - DLOG << " inputs size: " << op->GetInputs().size(); - VariableNameMap inputs = op->GetInputs(); - for (VariableNameMap::iterator it = inputs.begin(); - it != inputs.end(); ++it) { - DLOG << " " << it->first << " " << it->second; - } - - DLOG << " outputs size: " << op->GetOutputs().size(); - VariableNameMap outputs = 
op->GetOutputs(); - for (VariableNameMap::iterator it = outputs.begin(); - it != outputs.end(); ++it) { - DLOG << " " << it->first << " " << it->second; - } - - input_var_name = op->Input("X")[0]; - output_var_name = op->Output("Out")[0]; - std::shared_ptr> op_ptr = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(op_ptr); - return; - } - } - } - } - - std::shared_ptr predict(const Tensor &t) { - auto scope = program_.scope.get(); - Variable *input_feed_value = scope->Var(input_var_name); - auto tensor_input = input_feed_value->GetMutable(); - tensor_input->ShareDataWith(t); - - Variable *output = scope->Var(output_var_name); - auto *output_tensor = output->GetMutable(); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict(t, 0); - - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - string input_var_name; - string output_var_name; - - void predict(const Tensor &t, int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - op->Run(); - } - } -}; - -template class TestReshape2Op; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run Reshape2 Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_ocr) + "/model", - std::string(g_ocr) + "/params"); - - paddle_mobile::framework::Tensor input; - SetupTensor(&input, {1, 4, 4}, static_cast(0), - static_cast(1)); - auto *input_ptr = input.data(); - for (int i = 0; i < 16; ++i) { - *(input_ptr + i) = i; - } - DLOG << "input : "; - for (int i = 0; i < 
input.numel(); ++i) { - DLOG << " index " << i << " : " << input_ptr[i]; - } - - paddle_mobile::framework::TestReshape2Op testReshape2Op( - program); - - auto output = testReshape2Op.predict(input); - auto *output_ptr = output->data(); - - DLOG << "output : "; - for (int i = 0; i < output->numel(); ++i) { - DLOG << " index " << i << " : " << output_ptr[i]; - } - return 0; -} diff --git a/mobile/test/operators/test_reshape_op.cpp b/mobile/test/operators/test_reshape_op.cpp deleted file mode 100644 index ff3299f5e8..0000000000 --- a/mobile/test/operators/test_reshape_op.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/reshape_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_mobilenet_ssd)); - if (program.originProgram == nullptr) { - DLOG << "program read file"; - } - Executor4Test> - executor(program, "reshape"); - paddle_mobile::framework::Tensor input; - SetupTensor(&input, {2, 3, 3, 2}, static_cast(0), - static_cast(1)); - auto input_ptr = input.data(); - auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2}); - auto output = - executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim); - auto *output_ptr = output->data(); - - DLOG << "input : "; - for (int j = 0; j < input.numel(); ++j) { - DLOG << " index " << j << " : " << input_ptr[j]; - } - - DLOG << "output : "; - for (int j = 0; j < output->numel(); ++j) { - DLOG << " index " << j << " : " << output_ptr[j]; - } - - return 0; -} diff --git a/mobile/test/operators/test_resize_op.cpp b/mobile/test/operators/test_resize_op.cpp deleted file mode 100644 index c452ef8d85..0000000000 --- a/mobile/test/operators/test_resize_op.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/resize_op.h" - -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_mobilenet_ssd)); - if (program.originProgram == nullptr) { - DLOG << "program read file"; - } - Executor4Test> - executor(program, "resize"); - paddle_mobile::framework::Tensor input; - SetupTensor(&input, {2, 3, 3, 2}, static_cast(0), - static_cast(1)); - auto input_ptr = input.data(); - auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2}); - auto output = - executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim); - auto *output_ptr = output->data(); - - DLOG << "input : "; - for (int j = 0; j < input.numel(); ++j) { - DLOG << " index " << j << " : " << input_ptr[j]; - } - - DLOG << "output : "; - for (int j = 0; j < output->numel(); ++j) { - DLOG << " index " << j << " : " << output_ptr[j]; - } - - return 0; -} diff --git a/mobile/test/operators/test_scale_op.cpp b/mobile/test/operators/test_scale_op.cpp deleted file mode 100644 index 574779d71e..0000000000 --- a/mobile/test/operators/test_scale_op.cpp +++ /dev/null @@ -1,18 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_include.h" -#include "operators/scale_op.h" - -int main() {} diff --git a/mobile/test/operators/test_sequence_expand_op.cpp b/mobile/test/operators/test_sequence_expand_op.cpp deleted file mode 100644 index 731fc8e9e5..0000000000 --- a/mobile/test/operators/test_sequence_expand_op.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_include.h" -#include "operators/sequence_ops/sequence_expand_op.h" - -namespace paddle_mobile { - -int TestSequenceExpandOp(const framework::LoDTensor &input_x, - const framework::LoDTensor &input_y, int ref_level, - framework::LoDTensor *output) { - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input_x"}); - inputs["Y"] = std::vector({"input_y"}); - outputs["Out"] = std::vector({"output"}); - - auto input_x_var = scope.get()->Var("input_x"); - auto *x = input_x_var->template GetMutable(); - x->Resize(input_x.dims()); - x->ShareDataWith(input_x); - x->set_lod(input_x.lod()); - auto input_y_var = scope.get()->Var("input_y"); - auto *y = input_y_var->template GetMutable(); - y->Resize(framework::make_ddim({0})); - y->mutable_data(); - y->set_lod(input_y.lod()); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - attrs["ref_level"].Set(0); - - auto *op = new operators::SequenceExpandOp( - "sequence_expand", 
inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto *out = output_var->template Get(); - output->Resize(out->dims()); - output->ShareDataWith(*out); - output->set_lod(out->lod()); - delete op; - return 0; -} - -} // namespace paddle_mobile - -// namespace framework = paddle_mobile::framework; - -int main(int argc, char *argv[]) { - framework::LoDTensor input_x, input_y, output; - // case 1 - { - std::vector data{1, 2, 3, 4}; - input_x.Resize(framework::make_ddim({4, 1})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < 4; ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - input_y.set_lod({{0, 2, 4}, {0, 3, 6, 7, 8}}); - - TestSequenceExpandOp(input_x, input_y, 0, &output); - std::vector expect_data{1, 2, 1, 2, 3, 4, 3, 4}; - std::vector expect_lod{0, 2, 4, 6, 8}; - for (int i = 0; i < 5; ++i) { - if (output.lod()[0][i] != expect_lod[i]) { - std::cerr << "output_lod[" << i << "]: " << output.lod()[0][i] - << " != expect_lod[" << i << "]: " << expect_lod[i] - << std::endl; - return 1; - } - } - for (int i = 0; i < 8; ++i) { - if (output.data()[i] != expect_data[i]) { - std::cerr << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i] << std::endl; - return 1; - } - } - } - return 0; -} diff --git a/mobile/test/operators/test_sequence_pool_op.cpp b/mobile/test/operators/test_sequence_pool_op.cpp deleted file mode 100644 index de945c9ec0..0000000000 --- a/mobile/test/operators/test_sequence_pool_op.cpp +++ /dev/null @@ -1,293 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_include.h" -#include "operators/sequence_ops/sequence_pool_op.h" - -namespace paddle_mobile { - -int TestSequencePoolOp(const framework::LoDTensor &input_x, - const std::string pool_type, - framework::LoDTensor *output) { - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input_x"}); - outputs["Out"] = std::vector({"output"}); - - auto input_x_var = scope.get()->Var("input_x"); - auto *x = input_x_var->template GetMutable(); - x->Resize(input_x.dims()); - x->ShareDataWith(input_x); - x->set_lod(input_x.lod()); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - attrs["pooltype"].Set(pool_type); - - auto *op = new operators::SequencePoolOp( - "sequence_pool", inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto *out = output_var->template Get(); - output->Resize(out->dims()); - output->ShareDataWith(*out); - delete op; - return 0; -} - -} // namespace paddle_mobile - -// namespace framework = paddle_mobile::framework; - -int main(int argc, char *argv[]) { - framework::LoDTensor input_x, output; - // case 1 - DLOG << "running max case 1"; - { - std::vector data{1, 2, 3, 4}; - input_x.Resize(framework::make_ddim({4, 1})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < 4; ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "MAX", &output); - std::vector expect_data{2, 4}; - for (int i = 0; i < 2; ++i) { - if (output.data()[i] != expect_data[i]) 
{ - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 2 - DLOG << "running max case 2"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - input_x.Resize(framework::make_ddim({data.size(), 1})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 3, 10}}); - - TestSequencePoolOp(input_x, "MAX", &output); - std::vector expect_data{3, 10}; - for (int i = 0; i < 2; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - DLOG << "running max case 3"; - // case 3 - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8}; - input_x.Resize(framework::make_ddim({4, 2})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "MAX", &output); - std::vector expect_data{3, 4, 7, 8}; - for (int i = 0; i < 4; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 4 - DLOG << "running max case 4"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - input_x.Resize(framework::make_ddim({4, 5})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "MAX", &output); - std::vector expect_data{6, 7, 8, 9, 10, 16, 17, 18, 19, 20}; - for (int i = 0; i < 10; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 1 - DLOG << "running sum case 1"; - { - std::vector 
data{1, 2, 3, 4}; - input_x.Resize(framework::make_ddim({4, 1})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < 4; ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "SUM", &output); - std::vector expect_data{3, 7}; - for (int i = 0; i < 2; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 2 - DLOG << "running sum case 2"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - input_x.Resize(framework::make_ddim({data.size(), 1})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 3, 10}}); - - TestSequencePoolOp(input_x, "SUM", &output); - std::vector expect_data{6, 49}; - for (int i = 0; i < 2; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 3 - DLOG << "running sum case 3"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8}; - input_x.Resize(framework::make_ddim({4, 2})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "SUM", &output); - std::vector expect_data{4, 6, 12, 14}; - for (int i = 0; i < 4; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 4 - DLOG << "running sum case 4"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - input_x.Resize(framework::make_ddim({4, 5})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, 
"SUM", &output); - std::vector expect_data{7, 9, 11, 13, 15, 27, 29, 31, 33, 35}; - for (int i = 0; i < 10; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 1 - DLOG << "running first case 1"; - { - std::vector data{1, 2, 3, 4}; - input_x.Resize(framework::make_ddim({4, 1})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < 4; ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "FIRST", &output); - std::vector expect_data{1, 3}; - for (int i = 0; i < 2; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 2 - DLOG << "running first case 2"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - input_x.Resize(framework::make_ddim({data.size(), 1})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 3, 10}}); - - TestSequencePoolOp(input_x, "FIRST", &output); - std::vector expect_data{1, 4}; - for (int i = 0; i < 2; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - // case 3 - DLOG << "running first case 3"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8}; - input_x.Resize(framework::make_ddim({4, 2})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "FIRST", &output); - std::vector expect_data{1, 2, 5, 6}; - for (int i = 0; i < 4; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - 
// case 4 - DLOG << "running first case 4"; - { - std::vector data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - input_x.Resize(framework::make_ddim({4, 5})); - float *in_data = input_x.mutable_data(); - for (int i = 0; i < data.size(); ++i) in_data[i] = data[i]; - input_x.set_lod({{0, 2, 4}}); - - TestSequencePoolOp(input_x, "FIRST", &output); - std::vector expect_data{1, 2, 3, 4, 5, 11, 12, 13, 14, 15}; - for (int i = 0; i < 10; ++i) { - if (output.data()[i] != expect_data[i]) { - DLOG << "output[" << i << "]: " << output.data()[i] - << " != expect[" << i << "]: " << expect_data[i]; - return 1; - } - } - } - return 0; -} diff --git a/mobile/test/operators/test_sequence_softmax_op.cpp b/mobile/test/operators/test_sequence_softmax_op.cpp deleted file mode 100644 index d8e67f456f..0000000000 --- a/mobile/test/operators/test_sequence_softmax_op.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_include.h" -#include "operators/sequence_ops/sequence_softmax_op.h" - -namespace paddle_mobile { - -void SequenceSoftmax(const framework::LoDTensor *X, framework::LoDTensor *Y) { - const float *x = X->data(); - const auto &lod = X->lod().back(); - float *y = Y->mutable_data(); - for (int batch = 0; batch < lod.size() - 1; ++batch) { - int num_classes = lod[batch + 1] - lod[batch]; - size_t offset = lod[batch]; - const float *input = x + offset; - float *output = y + offset; - float max = -std::numeric_limits::max(); - for (int j = 0; j < num_classes; ++j) { - max = (input[j] > max) ? input[j] : max; - } - float sum = 0.f; - for (int j = 0; j < num_classes; ++j) { - float tmp = expf(input[j] - max); - sum += tmp; - output[j] = tmp; - } - for (int j = 0; j < num_classes; ++j) { - output[j] /= sum; - } - } - Y->set_lod(X->lod()); -} - -int TestSequenceSoftmaxOp(const std::vector &input_shape, - const std::vector &input_lod) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - input->set_lod({input_lod}); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - auto *op = new operators::SequenceSoftmaxOp( - "sequence_softmax", inputs, outputs, attrs, scope.get()); - - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::LoDTensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - SequenceSoftmax(input, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 
1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - TestSequenceSoftmaxOp({2, 1}, {0, 2}); - TestSequenceSoftmaxOp({100, 1}, {0, 3, 100}); - TestSequenceSoftmaxOp({100, 1}, {0, 50, 100}); - return 0; -} diff --git a/mobile/test/operators/test_sigmoid_op.cpp b/mobile/test/operators/test_sigmoid_op.cpp deleted file mode 100644 index bda7a79d94..0000000000 --- a/mobile/test/operators/test_sigmoid_op.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_include.h" -#include "operators/activation_op.h" - -namespace paddle_mobile { - -void Sigmoid(const framework::Tensor *X, framework::Tensor *Y) { - const float *x = X->data(); - float *y = Y->mutable_data(); - - for (int i = 0; i < X->numel(); ++i) { - y[i] = 1.f / (1.f + exp(-x[i])); - } -} - -int TestSigmoidOp(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - auto *op = new operators::SigmoidOp("sigmoid", inputs, outputs, - attrs, scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - Sigmoid(input, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main() { - paddle_mobile::TestSigmoidOp({1, 1, 2, 3}); - paddle_mobile::TestSigmoidOp({1, 3, 11, 22}); - paddle_mobile::TestSigmoidOp({1, 32, 112, 112}); - return 0; -} diff --git a/mobile/test/operators/test_slice_op.cpp b/mobile/test/operators/test_slice_op.cpp deleted file mode 100644 index 9306bc53c6..0000000000 --- a/mobile/test/operators/test_slice_op.cpp +++ /dev/null @@ -1,18 +0,0 @@ -/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/slice_op.h" - -int main() {} diff --git a/mobile/test/operators/test_softmax_op.cpp b/mobile/test/operators/test_softmax_op.cpp deleted file mode 100644 index e9ccb260b5..0000000000 --- a/mobile/test/operators/test_softmax_op.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_include.h" -#include "operators/softmax_op.h" - -namespace paddle_mobile { - -void Softmax(const framework::Tensor *X, framework::Tensor *Y) { - const framework::DDim &dims = X->dims(); - int batch_size = dims[0]; - int num_classes = dims[dims.size() - 1]; - int channels = X->numel() / batch_size / num_classes; - const float *x = X->data(); - float *y = Y->mutable_data(); - - for (int batch = 0; batch < batch_size; ++batch) { - for (int c = 0; c < channels; ++c) { - size_t offset = (batch * channels + c) * num_classes; - const float *input = x + offset; - float *output = y + offset; - float max = -std::numeric_limits::max(); - for (int j = 0; j < num_classes; ++j) { - max = (input[j] > max) ? input[j] : max; - } - float sum = 0.f; - for (int j = 0; j < num_classes; ++j) { - float tmp = expf(input[j] - max); - sum += tmp; - output[j] = tmp; - } - for (int j = 0; j < num_classes; ++j) { - output[j] /= sum; - } - } - } -} - -int TestSoftmaxOp(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - auto *op = new operators::SoftmaxOp("softmax", inputs, outputs, - attrs, scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - Softmax(input, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 
1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - TestSoftmaxOp({128, 1000}); - TestSoftmaxOp({128, 10, 1000}); - return 0; -} diff --git a/mobile/test/operators/test_sum_op.cpp b/mobile/test/operators/test_sum_op.cpp deleted file mode 100644 index 225a113f90..0000000000 --- a/mobile/test/operators/test_sum_op.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/sum_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestSumOp { - public: - explicit TestSumOp(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - if (op->Type() == "sum" && op->Input("X")[0] == "fc_2.tmp_0") { - DLOG << " sum attr size: " << op->GetAttrMap().size(); - DLOG << " inputs size: " << op->GetInputs().size(); - DLOG << " outputs size: " << op->GetOutputs().size(); - - std::shared_ptr> lrn = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(lrn); - } - } - } - } - - std::shared_ptr predict_bn(const Tensor &t1, const Tensor &t2) { - // feed - auto scope = program_.scope.get(); - Variable *x1_feed_value = scope->Var("fc_2.tmp_0"); - auto tensor_x1 = x1_feed_value->GetMutable(); - tensor_x1->ShareDataWith(t1); - - Variable *x2_feed_value = scope->Var("fc_2.tmp_1"); - auto tensor_x2 = x2_feed_value->GetMutable(); - tensor_x2->ShareDataWith(t2); - - Variable *output = scope->Var("fc_2.tmp_2"); - auto *output_tensor = output->GetMutable(); - output_tensor->mutable_data({2, 96}); - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict_bn(t1, t2, 0); - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr 
to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - - void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - DLOG << "op -> run()"; - op->Run(); - } - } -}; - -template class TestSumOp; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run Sum Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_eng) + "/model", - std::string(g_eng) + "/params"); - - /// input x (4,10,2,2) - paddle_mobile::framework::Tensor inputx1; - SetupTensor(&inputx1, {2, 96}, static_cast(0), - static_cast(1)); - auto *inputx1_ptr = inputx1.data(); - - paddle_mobile::framework::Tensor inputx2; - SetupTensor(&inputx2, {2, 96}, static_cast(0), - static_cast(1)); - auto *inputx2_ptr = inputx2.data(); - - paddle_mobile::framework::TestSumOp testSumOp(program); - - auto output_sum = testSumOp.predict_bn(inputx1, inputx2); - auto *output_sum_ptr = output_sum->data(); - - DLOG << "input1 44: " << inputx1_ptr[44]; - DLOG << "input2 44: " << inputx2_ptr[44]; - DLOG << "out 44 :" << output_sum_ptr[44]; - - return 0; -} diff --git a/mobile/test/operators/test_tanh_op.cpp b/mobile/test/operators/test_tanh_op.cpp deleted file mode 100644 index 13dfd09b3b..0000000000 --- a/mobile/test/operators/test_tanh_op.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "../test_include.h" -#include "operators/activation_op.h" - -namespace paddle_mobile { - -void Tanh(const framework::Tensor *X, framework::Tensor *Y) { - const float *x = X->data(); - float *y = Y->mutable_data(); - - for (int i = 0; i < X->numel(); ++i) { - y[i] = 2.f / (1.f + exp(-2.f * x[i])) - 1.f; - } -} - -int TestTanhOp(const std::vector input_shape) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - - auto output_var = scope.get()->Var("output"); - - framework::AttributeMap attrs; - auto *op = new operators::TanhOp("tanh", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - - framework::Tensor output_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - Tanh(input, &output_cmp); - - const float *output_data = output->data(); - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (gap > 1e-5 && std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace 
paddle_mobile - -int main() { - paddle_mobile::TestTanhOp({1, 1, 2, 3}); - paddle_mobile::TestTanhOp({1, 3, 11, 22}); - paddle_mobile::TestTanhOp({1, 32, 112, 112}); - std::cout << "test sigmoid op pass." << std::endl; - return 0; -} diff --git a/mobile/test/operators/test_topk_op.cpp b/mobile/test/operators/test_topk_op.cpp deleted file mode 100644 index cf0fde3705..0000000000 --- a/mobile/test/operators/test_topk_op.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "../test_include.h" -#include "operators/top_k_op.h" - -namespace paddle_mobile { - -void TopK(const framework::Tensor *X, framework::Tensor *Y, - framework::Tensor *Indices, const int K) { - const float *x = X->data(); - float *y = Y->mutable_data(); - int64_t *indices = Indices->mutable_data(); - - int dim_size = X->dims().size(); - int row = 1; - int col = X->dims()[dim_size - 1]; - for (int i = 0; i < dim_size - 1; ++i) { - row *= X->dims()[i]; - } - - std::vector vec(col); - for (int i = 0; i < row; ++i) { - for (int j = 0; j < col; ++j) { - vec[j] = x[i * col + j]; - } - for (int k = 0; k < K; ++k) { - float max = vec[0]; - int index = 0; - for (int j = 1; j < col; ++j) { - if (vec[j] > max) { - max = vec[j]; - index = j; - } - } - y[i * K + k] = max; - indices[i * K + k] = index; - vec[index] = -std::numeric_limits::max(); - } - } -} - -int TestTopKOp(const std::vector input_shape, const int K) { - framework::DDim dims = framework::make_ddim(input_shape); - VariableNameMap inputs; - VariableNameMap outputs; - auto scope = std::make_shared(); - inputs["X"] = std::vector({"input"}); - outputs["Out"] = std::vector({"output"}); - outputs["Indices"] = std::vector({"indices"}); - - auto input_var = scope.get()->Var("input"); - auto input = input_var->template GetMutable(); - SetupTensor(input, dims, -100.0, 100.0); - - auto output_var = scope.get()->Var("output"); - auto indices_var = scope.get()->Var("indices"); - - framework::AttributeMap attrs; - attrs["k"].Set(K); - auto *op = new operators::TopKOp("top_k", inputs, outputs, attrs, - scope.get()); - op->InferShape(); - op->Init(); - op->Run(); - - auto output = output_var->template Get(); - auto indices = indices_var->template Get(); - - framework::Tensor output_cmp, indices_cmp; - float *output_cmp_data = output_cmp.mutable_data(output->dims()); - int64_t *indices_cmp_data = - indices_cmp.mutable_data(indices->dims()); - TopK(input, &output_cmp, &indices_cmp, K); - - // sort 
output - float *output_data = const_cast(output->data()); - int64_t *indices_data = const_cast(indices->data()); - // std::vector> vec(K); - // for (int i = 0; i < output->numel() / K; ++i) { - // for (int j = 0; j < K; ++j) { - // vec[j] = std::move(std::make_pair(output_data[i * K + j], - // indices_data[i * K + j])); - // } - // std::sort(vec.begin(), vec.end(), - // [](const std::pair &l, - // const std::pair &r) { - // return l.first > r.first; }); - // for (int j = 0; j < K; ++j) { - // output_data[i * K + j] = vec[j].first; - // indices_data[i * K + j] = vec[j].second; - // } - // } - - for (int i = 0; i < output->numel(); ++i) { - float gap = output_data[i] - output_cmp_data[i]; - if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { - LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] - << ", output_cmp_data[" << i - << "] = " << output_cmp_data[i]; - delete op; - exit(1); - } - } - - for (int i = 0; i < indices->numel(); ++i) { - if (indices_data[i] != indices_cmp_data[i]) { - LOG(kLOG_INFO) << "indices_data[" << i << "] = " << indices_data[i] - << ", indices_cmp_data[" << i - << "] = " << indices_cmp_data[i]; - delete op; - exit(1); - } - } - delete op; - return 0; -} - -} // namespace paddle_mobile - -int main(int argc, char *argv[]) { - TestTopKOp({1, 100}, 1); - TestTopKOp({128, 100}, 10); - TestTopKOp({128, 2, 100}, 10); - return 0; -} diff --git a/mobile/test/operators/test_transpose2_op.cpp b/mobile/test/operators/test_transpose2_op.cpp deleted file mode 100644 index 4c4f5e4c26..0000000000 --- a/mobile/test/operators/test_transpose2_op.cpp +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_include.h" -#include "operators/transpose2_op.h" - -namespace paddle_mobile { -namespace framework { - -template -class TestTranspose2Op { - public: - explicit TestTranspose2Op(const Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - const std::vector> blocks = - to_predict_program_->Blocks(); - for (auto block_desc : blocks) { - std::vector> ops = block_desc->Ops(); - for (auto op : ops) { - if (op->Type() == "transpose2") { - DLOG << " attr size: " << op->GetAttrMap().size(); - std::unordered_map attrs = op->GetAttrMap(); - for (std::unordered_map::iterator it = - attrs.begin(); - it != attrs.end(); ++it) { - DLOG << " " << it->first << " " << it->second; - } - - DLOG << " inputs size: " << op->GetInputs().size(); - VariableNameMap inputs = op->GetInputs(); - for (VariableNameMap::iterator it = inputs.begin(); - it != inputs.end(); ++it) { - DLOG << " " << it->first << " " << it->second; - } - - DLOG << " outputs size: " << op->GetOutputs().size(); - VariableNameMap outputs = op->GetOutputs(); - for (VariableNameMap::iterator it = outputs.begin(); - it != outputs.end(); ++it) { - DLOG << " " << it->first << " " << it->second; - } - - input_var_name = op->Input("X")[0]; - output_var_name = op->Output("Out")[0]; - std::shared_ptr> op_ptr = - std::make_shared>( - op->Type(), op->GetInputs(), op->GetOutputs(), - op->GetAttrMap(), program_.scope.get()); - ops_of_block_[*block_desc.get()].push_back(op_ptr); - return; - } - } - } - } - - 
std::shared_ptr predict(const Tensor &t) { - auto scope = program_.scope.get(); - Variable *input_feed_value = scope->Var(input_var_name); - auto tensor_input = input_feed_value->GetMutable(); - tensor_input->ShareDataWith(t); - - Variable *output = scope->Var(output_var_name); - auto *output_tensor = output->GetMutable(); - output_tensor->mutable_data({1, 2, 8}); - - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - - predict(t, 0); - - return out_tensor; - } - - private: - const framework::Program program_; - std::shared_ptr to_predict_program_; - std::map>>> - ops_of_block_; - bool use_optimize_ = false; - string input_var_name; - string output_var_name; - - void predict(const Tensor &t, int block_id) { - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - op->Run(); - } - } -}; - -template class TestTranspose2Op; -} // namespace framework -} // namespace paddle_mobile - -int main() { - DLOG << "----------**********----------"; - DLOG << "begin to run Transpose2 Test"; - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_ocr) + "/model", - std::string(g_ocr) + "/params"); - - paddle_mobile::framework::Tensor input; - SetupTensor(&input, {1, 8, 2}, static_cast(0), - static_cast(1)); - auto *input_ptr = input.data(); - for (int i = 0; i < 16; ++i) { - *(input_ptr + i) = i; - } - DLOG << "input : "; - for (int i = 0; i < input.numel(); ++i) { - DLOG << " index " << i << " : " << input_ptr[i]; - } - - paddle_mobile::framework::TestTranspose2Op - testTranspose2Op(program); - - auto output = testTranspose2Op.predict(input); - auto *output_ptr = output->data(); - - DLOG << "output : "; - for (int i = 0; i < output->numel(); ++i) { - DLOG << " index " << i << " : " << output_ptr[i]; - } - return 0; -} diff --git 
a/mobile/test/operators/test_transpose_op.cpp b/mobile/test/operators/test_transpose_op.cpp deleted file mode 100644 index 263fdcfa0e..0000000000 --- a/mobile/test/operators/test_transpose_op.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "../test_helper.h" -#include "../test_include.h" -#include "operators/transpose_op.h" -int main() { - paddle_mobile::framework::Loader loader; - auto program = loader.Load(std::string(g_mobilenet_ssd)); - if (program.originProgram == nullptr) { - DLOG << "program read file"; - } - Executor4Test> - executor(program, "transpose"); - paddle_mobile::framework::Tensor input; - SetupTensor(&input, {1, 2, 3, 4}, static_cast(0), - static_cast(1)); - auto input_ptr = input.data(); - auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 4, 2}); - auto output = - executor.Predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim); - auto *output_ptr = output->data(); - - DLOG << "input : "; - for (int j = 0; j < input.numel(); ++j) { - DLOG << " index " << j << " : " << input_ptr[j]; - } - - DLOG << "output : "; - for (int j = 0; j < output->numel(); ++j) { - DLOG << " index " << j << " : " << output_ptr[j]; - } - DLOG << " for example : "; - DLOG << " you can check if input[16] == output[9] "; - DLOG << " you can check if input[12] == output[1] "; - return 0; -} diff --git a/mobile/test/test_helper.h b/mobile/test/test_helper.h deleted 
file mode 100644 index 98893eeac0..0000000000 --- a/mobile/test/test_helper.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "common/common.h" -#include "common/log.h" -#include "framework/ddim.h" -#include "framework/lod_tensor.h" - -static const char *g_ocr = "../models/ocr"; -static const char *g_mobilenet_ssd = "../models/mobilenet+ssd"; -static const char *g_genet_combine = "../models/enet"; -static const char *g_eng = "../models/eng_20conv_1_9_fc"; -static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture"; -static const char *g_mobilenet_combined = "../models/mobilenet_combine"; -static const char *g_googlenetv1_combined = "../models/googlenetv1_combine"; -static const char *g_mobilenet_detect = "../models/mobilenet-detect"; -static const char *g_squeezenet = "../models/squeezenet"; -static const char *g_googlenet = "../models/googlenet"; -static const char *g_googlenet_quali = "../models/googlenet_combine_quali"; -static const char *g_mobilenet = "../models/mobilenet"; -static const char *g_mobilenet_mul = "../models/r"; -static const char *g_alexnet = "../models/alexnet"; -static const char *g_inceptionv4 = "../models/inceptionv4"; -static const char *g_inceptionv3 = - "../models/InceptionV3_Spatial_Attention_Model"; -static const char *g_nlp = "../models/nlp"; -static const char *g_super = 
"../models/superresoltion"; -static const char *g_superv2 = "../models/superv2"; -static const char *g_resnet_50 = "../models/resnet_50"; -static const char *g_resnet = "../models/resnet"; -static const char *g_googlenet_combine = "../models/googlenet_combine"; -static const char *g_yolo = "../models/yolo"; -static const char *g_yolo_combined = "../models/yolo_combined"; -static const char *g_yolo_mul = "../models/d"; -static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; -static const char *g_vgg16_ssd_combined = "../models/vgg16_ssd_combined"; -static const char *g_mobilenet_vision = "../models/vision_mobilenet"; -static const char *g_yolo_vision = "../models/vision_yolo"; -static const char *g_test_image_1x3x224x224 = - "../images/test_image_1x3x224x224_float"; -static const char *g_test_image_1x3x224x224_banana = - "../images/input_3x224x224_banana"; -static const char *g_test_image_desktop_1_3_416_416_nchw_float = - "../images/in_put_1_3_416_416_2"; -static const char *g_hand = "../images/hand_image"; -static const char *g_moto = "../images/moto_300x300_float"; -static const char *g_imgfssd_ar = "../images/test_image_ssd_ar"; -static const char *g_imgfssd_ar1 = "../images/003_0001.txt"; -static const char *g_img = "../images/img.bin"; -static const char *g_yolo_img = "../images/in_put_1_3_416_416_2"; -static const char *g_super_img = "../images/mingren_input_data"; -static const char *g_mobilenet_img = "../images/image"; -static const char *g_test_image_1x3x224x224_vision_mobilenet_input = - "../images/vision_mobilenet_input"; -static const char *g_test_image_1x3x416x416_vision_yolo_input = - "../images/yolo_input"; - -using namespace paddle_mobile; // NOLINT -using paddle_mobile::framework::DDim; -using paddle_mobile::framework::LoDTensor; -using paddle_mobile::framework::Tensor; - -template -void SetupTensor(paddle_mobile::framework::Tensor *input, - paddle_mobile::framework::DDim dims, T lower, T upper) { - static unsigned int seed = 100; - 
std::mt19937 rng(seed++); - std::uniform_real_distribution uniform_dist(0, 1); - - T *input_ptr = input->mutable_data(dims); - for (int i = 0; i < input->numel(); ++i) { - input_ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } -} - -template <> -void SetupTensor(paddle_mobile::framework::Tensor *input, - paddle_mobile::framework::DDim dims, bool lower, - bool upper) { - static unsigned int seed = 100; - std::mt19937 rng(seed++); - std::uniform_real_distribution uniform_dist(0, 1); - - bool *input_ptr = input->mutable_data(dims); - if (lower == upper) { - for (int i = 0; i < input->numel(); ++i) { - input_ptr[i] = lower; - } - } else { - for (int i = 0; i < input->numel(); ++i) { - input_ptr[i] = uniform_dist(rng) > 0.5; - } - } -} - -template -T *CreateInput(Tensor *input, DDim dims, T low, T up) { - SetupTensor(input, dims, static_cast(low), static_cast(up)); - return input->data(); -} - -template -void GetInput(const std::string &input_name, std::vector *input, - const std::vector &dims) { - int size = 1; - for (const auto &dim : dims) { - size *= dim; - } - - T *input_ptr = reinterpret_cast(malloc(sizeof(T) * size)); - std::ifstream in(input_name, std::ios::in | std::ios::binary); - in.read(reinterpret_cast(input_ptr), size * sizeof(T)); - in.close(); - for (int i = 0; i < size; ++i) { - input->push_back(input_ptr[i]); - } - free(input_ptr); -} - -template -void GetInput(const std::string &input_name, - paddle_mobile::framework::Tensor *input, - paddle_mobile::framework::DDim dims) { - T *input_ptr = input->mutable_data(dims); - - std::ifstream in(input_name, std::ios::in | std::ios::binary); - in.read(reinterpret_cast(input_ptr), input->numel() * sizeof(T)); - in.close(); -} diff --git a/mobile/test/test_include.h b/mobile/test/test_include.h deleted file mode 100644 index cce946848c..0000000000 --- a/mobile/test/test_include.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "./test_helper.h" -#include "common/enforce.h" -#include "common/log.h" -#include "common/util.h" -#include "executor_for_test.h" -#include "framework/ddim.h" -#include "framework/lod_tensor.h" -#include "framework/operator.h" -#include "framework/program/block_desc.h" -#include "framework/program/program.h" -#include "framework/program/program_desc.h" -#include "framework/scope.h" -#include "framework/tensor.h" -#include "framework/variable.h" -#include "io/paddle_mobile.h" - -#ifdef PADDLE_MOBILE_CL -#include "framework/cl/cl_image.h" -#endif diff --git a/mobile/third_party/opencl/.gitinore b/mobile/third_party/opencl/.gitinore deleted file mode 100644 index 0c27d54300..0000000000 --- a/mobile/third_party/opencl/.gitinore +++ /dev/null @@ -1 +0,0 @@ -OpenCL-Headers diff --git a/mobile/tools/android-cmake/android.toolchain.cmake b/mobile/tools/android-cmake/android.toolchain.cmake deleted file mode 100644 index b897a473d9..0000000000 --- a/mobile/tools/android-cmake/android.toolchain.cmake +++ /dev/null @@ -1,784 +0,0 @@ -# Copyright (C) 2016 The Android Open Source Project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Configurable variables. -# Modeled after the ndk-build system. -# For any variables defined in: -# https://developer.android.com/ndk/guides/android_mk.html -# https://developer.android.com/ndk/guides/application_mk.html -# if it makes sense for CMake, then replace LOCAL, APP, or NDK with ANDROID, and -# we have that variable below. -# The exception is ANDROID_TOOLCHAIN vs NDK_TOOLCHAIN_VERSION. -# Since we only have one version of each gcc and clang, specifying a version -# doesn't make much sense. -# -# ANDROID_TOOLCHAIN -# ANDROID_ABI -# ANDROID_PLATFORM -# ANDROID_STL -# ANDROID_PIE -# ANDROID_CPP_FEATURES -# ANDROID_ALLOW_UNDEFINED_SYMBOLS -# ANDROID_ARM_MODE -# ANDROID_ARM_NEON -# ANDROID_DISABLE_NO_EXECUTE -# ANDROID_DISABLE_RELRO -# ANDROID_DISABLE_FORMAT_STRING_CHECKS -# ANDROID_CCACHE - -# cmake_minimum_required(VERSION 3.6.0) - -# Inhibit all of CMake's own NDK handling code. -set(CMAKE_SYSTEM_VERSION 1) - -# CMake invokes the toolchain file twice during the first build, but only once -# during subsequent rebuilds. This was causing the various flags to be added -# twice on the first build, and on a rebuild ninja would see only one set of the -# flags and rebuild the world. -# https://github.com/android-ndk/ndk/issues/323 -if(ANDROID_NDK_TOOLCHAIN_INCLUDED) - return() -endif(ANDROID_NDK_TOOLCHAIN_INCLUDED) -set(ANDROID_NDK_TOOLCHAIN_INCLUDED true) - -# Android NDK -if(NOT ANDROID_NDK) - get_filename_component(ANDROID_NDK "$ENV{NDK_ROOT}" ABSOLUTE) -else() - # Allow the user to specify their own NDK path, but emit a warning. 
This is an - # uncommon use case, but helpful if users want to use a bleeding edge - # toolchain file with a stable NDK. - # https://github.com/android-ndk/ndk/issues/473 - message(WARNING "Using custom NDK path (ANDROID_NDK is set): ${ANDROID_NDK}") -endif() -file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK) - -# Android NDK revision -message("${ANDROID_NDK}") - -file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES) -set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX - "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.") -if(NOT ANDROID_NDK_SOURCE_PROPERTIES MATCHES "${ANDROID_NDK_SOURCE_PROPERTIES_REGEX}") - message(SEND_ERROR "Failed to parse Android NDK revision: ${ANDROID_NDK}/source.properties.\n${ANDROID_NDK_SOURCE_PROPERTIES}") -endif() -string(REGEX REPLACE "${ANDROID_NDK_SOURCE_PROPERTIES_REGEX}" "\\1" - ANDROID_NDK_REVISION "${ANDROID_NDK_SOURCE_PROPERTIES}") - -# Touch toolchain variable to suppress "unused variable" warning. -# This happens if CMake is invoked with the same command line the second time. -if(CMAKE_TOOLCHAIN_FILE) -endif() - -# Compatibility for configurable variables. -# Compatible with configurable variables from the other toolchain file: -# https://github.com/taka-no-me/android-cmake -# TODO: We should consider dropping compatibility to simplify things once most -# of our users have migrated to our standard set of configurable variables. 
-if(ANDROID_TOOLCHAIN_NAME AND NOT ANDROID_TOOLCHAIN) - if(ANDROID_TOOLCHAIN_NAME MATCHES "-clang([0-9].[0-9])?$") - set(ANDROID_TOOLCHAIN clang) - elseif(ANDROID_TOOLCHAIN_NAME MATCHES "-[0-9].[0-9]$") - set(ANDROID_TOOLCHAIN gcc) - endif() -endif() -if(ANDROID_ABI STREQUAL "armeabi-v7a with NEON") - set(ANDROID_ABI armeabi-v7a) - set(ANDROID_ARM_NEON TRUE) -elseif(ANDROID_TOOLCHAIN_NAME AND NOT ANDROID_ABI) - if(ANDROID_TOOLCHAIN_NAME MATCHES "^arm-linux-androideabi-") - set(ANDROID_ABI armeabi-v7a) - elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^aarch64-linux-android-") - set(ANDROID_ABI arm64-v8a) - elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^x86-") - set(ANDROID_ABI x86) - elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^x86_64-") - set(ANDROID_ABI x86_64) - elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^mipsel-linux-android-") - set(ANDROID_ABI mips) - elseif(ANDROID_TOOLCHAIN_NAME MATCHES "^mips64el-linux-android-") - set(ANDROID_ABI mips64) - endif() -endif() -if(ANDROID_NATIVE_API_LEVEL AND NOT ANDROID_PLATFORM) - if(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$") - set(ANDROID_PLATFORM ${ANDROID_NATIVE_API_LEVEL}) - elseif(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$") - set(ANDROID_PLATFORM android-${ANDROID_NATIVE_API_LEVEL}) - endif() -endif() -if(DEFINED ANDROID_APP_PIE AND NOT DEFINED ANDROID_PIE) - set(ANDROID_PIE "${ANDROID_APP_PIE}") -endif() -if(ANDROID_STL_FORCE_FEATURES AND NOT DEFINED ANDROID_CPP_FEATURES) - set(ANDROID_CPP_FEATURES "rtti exceptions") -endif() -if(DEFINED ANDROID_NO_UNDEFINED AND NOT DEFINED ANDROID_ALLOW_UNDEFINED_SYMBOLS) - if(ANDROID_NO_UNDEFINED) - set(ANDROID_ALLOW_UNDEFINED_SYMBOLS FALSE) - else() - set(ANDROID_ALLOW_UNDEFINED_SYMBOLS TRUE) - endif() -endif() -if(DEFINED ANDROID_SO_UNDEFINED AND NOT DEFINED ANDROID_ALLOW_UNDEFINED_SYMBOLS) - set(ANDROID_ALLOW_UNDEFINED_SYMBOLS "${ANDROID_SO_UNDEFINED}") -endif() -if(DEFINED ANDROID_FORCE_ARM_BUILD AND NOT ANDROID_ARM_MODE) - if(ANDROID_FORCE_ARM_BUILD) - set(ANDROID_ARM_MODE arm) - else() - 
set(ANDROID_ARM_MODE thumb) - endif() -endif() -if(DEFINED ANDROID_NOEXECSTACK AND NOT DEFINED ANDROID_DISABLE_NO_EXECUTE) - if(ANDROID_NOEXECSTACK) - set(ANDROID_DISABLE_NO_EXECUTE FALSE) - else() - set(ANDROID_DISABLE_NO_EXECUTE TRUE) - endif() -endif() -if(DEFINED ANDROID_RELRO AND NOT DEFINED ANDROID_DISABLE_RELRO) - if(ANDROID_RELRO) - set(ANDROID_DISABLE_RELRO FALSE) - else() - set(ANDROID_DISABLE_RELRO TRUE) - endif() -endif() -if(NDK_CCACHE AND NOT ANDROID_CCACHE) - set(ANDROID_CCACHE "${NDK_CCACHE}") -endif() - -# Default values for configurable variables. -if(NOT ANDROID_TOOLCHAIN) - set(ANDROID_TOOLCHAIN gcc) -endif() -if(NOT ANDROID_ABI) - set(ANDROID_ABI armeabi-v7a) -endif() -if(ANDROID_PLATFORM MATCHES "^android-([0-9]|1[0-3])$") - set(ANDROID_PLATFORM android-14) -elseif(ANDROID_PLATFORM STREQUAL android-20) - set(ANDROID_PLATFORM android-19) -elseif(ANDROID_PLATFORM STREQUAL android-25) - set(ANDROID_PLATFORM android-24) -elseif(NOT ANDROID_PLATFORM) - set(ANDROID_PLATFORM android-14) -endif() -string(REPLACE "android-" "" ANDROID_PLATFORM_LEVEL ${ANDROID_PLATFORM}) -if(ANDROID_ABI MATCHES "64(-v8a)?$" AND ANDROID_PLATFORM_LEVEL LESS 21) - set(ANDROID_PLATFORM android-21) - set(ANDROID_PLATFORM_LEVEL 21) -endif() -if(NOT ANDROID_STL) - set(ANDROID_STL gnustl_static) -endif() -if(NOT DEFINED ANDROID_PIE) - if(ANDROID_PLATFORM_LEVEL LESS 16) - set(ANDROID_PIE FALSE) - else() - set(ANDROID_PIE TRUE) - endif() -endif() -if(NOT ANDROID_ARM_MODE) - set(ANDROID_ARM_MODE thumb) -endif() - -# Export configurable variables for the try_compile() command. -set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES - ANDROID_TOOLCHAIN - ANDROID_ABI - ANDROID_PLATFORM - ANDROID_STL - ANDROID_PIE - ANDROID_CPP_FEATURES - ANDROID_ALLOW_UNDEFINED_SYMBOLS - ANDROID_ARM_MODE - ANDROID_ARM_NEON - ANDROID_DISABLE_NO_EXECUTE - ANDROID_DISABLE_RELRO - ANDROID_DISABLE_FORMAT_STRING_CHECKS - ANDROID_CCACHE) - -# Standard cross-compiling stuff. 
-set(ANDROID TRUE) -set(CMAKE_SYSTEM_NAME Android) - -# Allow users to override these values in case they want more strict behaviors. -# For example, they may want to prevent the NDK's libz from being picked up so -# they can use their own. -# https://github.com/android-ndk/ndk/issues/517 -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -endif() - -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -endif() - -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endif() - -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) -endif() - -# ABI. -set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) -if(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - set(ANDROID_SYSROOT_ABI arm) - set(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) - set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME}) - set(ANDROID_HEADER_TRIPLE arm-linux-androideabi) - if(ANDROID_ABI STREQUAL armeabi) - message(WARNING "armeabi is deprecated and will be removed in a future NDK " - "release.") - set(CMAKE_SYSTEM_PROCESSOR armv5te) - set(ANDROID_LLVM_TRIPLE armv5te-none-linux-androideabi) - elseif(ANDROID_ABI STREQUAL armeabi-v7a) - set(CMAKE_SYSTEM_PROCESSOR armv7-a) - set(ANDROID_LLVM_TRIPLE armv7-none-linux-androideabi) - endif() -elseif(ANDROID_ABI STREQUAL arm64-v8a) - set(ANDROID_SYSROOT_ABI arm64) - set(CMAKE_SYSTEM_PROCESSOR aarch64) - set(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) - set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME}) - set(ANDROID_LLVM_TRIPLE aarch64-none-linux-android) - set(ANDROID_HEADER_TRIPLE aarch64-linux-android) -elseif(ANDROID_ABI STREQUAL x86) - set(ANDROID_SYSROOT_ABI x86) - set(CMAKE_SYSTEM_PROCESSOR i686) - set(ANDROID_TOOLCHAIN_NAME i686-linux-android) - set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_ABI}) - set(ANDROID_LLVM_TRIPLE i686-none-linux-android) - set(ANDROID_HEADER_TRIPLE i686-linux-android) -elseif(ANDROID_ABI STREQUAL 
x86_64) - set(ANDROID_SYSROOT_ABI x86_64) - set(CMAKE_SYSTEM_PROCESSOR x86_64) - set(ANDROID_TOOLCHAIN_NAME x86_64-linux-android) - set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_ABI}) - set(ANDROID_LLVM_TRIPLE x86_64-none-linux-android) - set(ANDROID_HEADER_TRIPLE x86_64-linux-android) -elseif(ANDROID_ABI STREQUAL mips) - message(WARNING "mips is deprecated and will be removed in a future NDK " - "release.") - set(ANDROID_SYSROOT_ABI mips) - set(CMAKE_SYSTEM_PROCESSOR mips) - set(ANDROID_TOOLCHAIN_NAME mips64el-linux-android) - set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME}) - set(ANDROID_LLVM_TRIPLE mipsel-none-linux-android) - set(ANDROID_HEADER_TRIPLE mipsel-linux-android) -elseif(ANDROID_ABI STREQUAL mips64) - message(WARNING "mips64 is deprecated and will be removed in a future NDK " - "release.") - set(ANDROID_SYSROOT_ABI mips64) - set(CMAKE_SYSTEM_PROCESSOR mips64) - set(ANDROID_TOOLCHAIN_NAME mips64el-linux-android) - set(ANDROID_TOOLCHAIN_ROOT ${ANDROID_TOOLCHAIN_NAME}) - set(ANDROID_LLVM_TRIPLE mips64el-none-linux-android) - set(ANDROID_HEADER_TRIPLE mips64el-linux-android) -else() - message(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") -endif() - -set(ANDROID_COMPILER_FLAGS) -set(ANDROID_COMPILER_FLAGS_CXX) -set(ANDROID_COMPILER_FLAGS_DEBUG) -set(ANDROID_COMPILER_FLAGS_RELEASE) -set(ANDROID_LINKER_FLAGS) -set(ANDROID_LINKER_FLAGS_EXE) - -# Don't re-export libgcc symbols in every binary. -list(APPEND ANDROID_LINKER_FLAGS -Wl,--exclude-libs,libgcc.a) -list(APPEND ANDROID_LINKER_FLAGS -Wl,--exclude-libs,libatomic.a) - -# STL. 
-set(ANDROID_STL_STATIC_LIBRARIES) -set(ANDROID_STL_SHARED_LIBRARIES) -if(ANDROID_STL STREQUAL system) - if(NOT "x${ANDROID_CPP_FEATURES}" STREQUAL "x") - set(ANDROID_STL_STATIC_LIBRARIES supc++) - endif() -elseif(ANDROID_STL STREQUAL stlport_static) - set(ANDROID_STL_STATIC_LIBRARIES stlport_static) -elseif(ANDROID_STL STREQUAL stlport_shared) - set(ANDROID_STL_SHARED_LIBRARIES stlport_shared) -elseif(ANDROID_STL STREQUAL gnustl_static) - set(ANDROID_STL_STATIC_LIBRARIES gnustl_static) -elseif(ANDROID_STL STREQUAL gnustl_shared) - set(ANDROID_STL_STATIC_LIBRARIES supc++) - set(ANDROID_STL_SHARED_LIBRARIES gnustl_shared) -elseif(ANDROID_STL STREQUAL c++_static) - set(ANDROID_STL_STATIC_LIBRARIES c++) -elseif(ANDROID_STL STREQUAL c++_shared) - set(ANDROID_STL_SHARED_LIBRARIES c++) -elseif(ANDROID_STL STREQUAL none) -else() - message(FATAL_ERROR "Invalid Android STL: ${ANDROID_STL}.") -endif() - -# Behavior of CMAKE_SYSTEM_LIBRARY_PATH and CMAKE_LIBRARY_PATH are really weird -# when CMAKE_SYSROOT is set. The library path is appended to the sysroot even if -# the library path is an abspath. Using a relative path from the sysroot doesn't -# work either, because the relative path is abspath'd relative to the current -# CMakeLists.txt file before being appended :( -# -# We can try to get out of this problem by providing another root path for cmake -# to check. CMAKE_FIND_ROOT_PATH is intended for this purpose: -# https://cmake.org/cmake/help/v3.8/variable/CMAKE_FIND_ROOT_PATH.html -# -# In theory this should just be our sysroot, but since we don't have a single -# sysroot that is correct (there's only one set of headers, but multiple -# locations for libraries that need to be handled differently). Some day we'll -# want to move all the libraries into ${ANDROID_NDK}/sysroot, but we'll need to -# make some fixes to Clang, various build systems, and possibly CMake itself to -# get that working. -list(APPEND CMAKE_FIND_ROOT_PATH "${ANDROID_NDK}") - -# Sysroot. 
-set(CMAKE_SYSROOT "${ANDROID_NDK}/sysroot") - -# CMake 3.9 tries to use CMAKE_SYSROOT_COMPILE before it gets set from -# CMAKE_SYSROOT, which leads to using the system's /usr/include. Set this -# manually. -# https://github.com/android-ndk/ndk/issues/467 -set(CMAKE_SYSROOT_COMPILE "${CMAKE_SYSROOT}") - -# The compiler driver doesn't check any arch specific include locations (though -# maybe we should add that). Architecture specific headers like asm/ and -# machine/ are installed to an arch-$ARCH subdirectory of the sysroot. -list(APPEND ANDROID_COMPILER_FLAGS - "-isystem ${CMAKE_SYSROOT}/usr/include/${ANDROID_HEADER_TRIPLE}") -list(APPEND ANDROID_COMPILER_FLAGS - "-D__ANDROID_API__=${ANDROID_PLATFORM_LEVEL}") - -# We need different sysroots for linking and compiling, but cmake doesn't -# support that. Pass the sysroot flag manually when linking. -set(ANDROID_SYSTEM_LIBRARY_PATH - "${ANDROID_NDK}/platforms/${ANDROID_PLATFORM}/arch-${ANDROID_SYSROOT_ABI}") -list(APPEND ANDROID_LINKER_FLAGS "--sysroot ${ANDROID_SYSTEM_LIBRARY_PATH}") - -# find_library searches a handful of paths as described by -# https://cmake.org/cmake/help/v3.6/command/find_library.html. Since libraries -# are per-API level and headers aren't, We don't have libraries in the -# CMAKE_SYSROOT. Set up CMAKE_SYSTEM_LIBRARY_PATH -# (https://cmake.org/cmake/help/v3.6/variable/CMAKE_SYSTEM_LIBRARY_PATH.html) -# instead. -# -# NB: The suffix is just lib here instead of dealing with lib64 because -# apparently CMake does some automatic rewriting of that? I've been testing by -# building my own CMake with a bunch of logging added, and that seems to be the -# case. -list(APPEND CMAKE_SYSTEM_LIBRARY_PATH - "${ANDROID_SYSTEM_LIBRARY_PATH}/usr/lib") - -# Toolchain. 
-if(CMAKE_HOST_SYSTEM_NAME STREQUAL Linux) - set(ANDROID_HOST_TAG linux-x86_64) -elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Darwin) - set(ANDROID_HOST_TAG darwin-x86_64) -elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows) - set(ANDROID_HOST_TAG windows-x86_64) -endif() -set(ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/${ANDROID_TOOLCHAIN_ROOT}-4.9/prebuilt/${ANDROID_HOST_TAG}") -set(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") -if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows) - set(ANDROID_TOOLCHAIN_SUFFIX .exe) -endif() - -set(ANDROID_HOST_PREBUILTS "${ANDROID_NDK}/prebuilt/${ANDROID_HOST_TAG}") - -if(ANDROID_TOOLCHAIN STREQUAL clang) - set(ANDROID_LLVM_TOOLCHAIN_PREFIX "${ANDROID_NDK}/toolchains/llvm/prebuilt/${ANDROID_HOST_TAG}/bin/") - set(ANDROID_C_COMPILER "${ANDROID_LLVM_TOOLCHAIN_PREFIX}clang${ANDROID_TOOLCHAIN_SUFFIX}") - set(ANDROID_CXX_COMPILER "${ANDROID_LLVM_TOOLCHAIN_PREFIX}clang++${ANDROID_TOOLCHAIN_SUFFIX}") - set(ANDROID_ASM_COMPILER "${ANDROID_LLVM_TOOLCHAIN_PREFIX}clang${ANDROID_TOOLCHAIN_SUFFIX}") - # Clang can fail to compile if CMake doesn't correctly supply the target and - # external toolchain, but to do so, CMake needs to already know that the - # compiler is clang. Tell CMake that the compiler is really clang, but don't - # use CMakeForceCompiler, since we still want compile checks. We only want - # to skip the compiler ID detection step. 
- set(CMAKE_C_COMPILER_ID_RUN TRUE) - set(CMAKE_CXX_COMPILER_ID_RUN TRUE) - set(CMAKE_C_COMPILER_ID Clang) - set(CMAKE_CXX_COMPILER_ID Clang) - set(CMAKE_C_COMPILER_VERSION 3.8) - set(CMAKE_CXX_COMPILER_VERSION 3.8) - set(CMAKE_C_STANDARD_COMPUTED_DEFAULT 11) - set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT 98) - set(CMAKE_C_COMPILER_TARGET ${ANDROID_LLVM_TRIPLE}) - set(CMAKE_CXX_COMPILER_TARGET ${ANDROID_LLVM_TRIPLE}) - set(CMAKE_ASM_COMPILER_TARGET ${ANDROID_LLVM_TRIPLE}) - set(CMAKE_C_COMPILER_EXTERNAL_TOOLCHAIN "${ANDROID_TOOLCHAIN_ROOT}") - set(CMAKE_CXX_COMPILER_EXTERNAL_TOOLCHAIN "${ANDROID_TOOLCHAIN_ROOT}") - set(CMAKE_ASM_COMPILER_EXTERNAL_TOOLCHAIN "${ANDROID_TOOLCHAIN_ROOT}") - set(ANDROID_AR "${ANDROID_TOOLCHAIN_PREFIX}ar${ANDROID_TOOLCHAIN_SUFFIX}") - set(ANDROID_RANLIB "${ANDROID_TOOLCHAIN_PREFIX}ranlib${ANDROID_TOOLCHAIN_SUFFIX}") -elseif(ANDROID_TOOLCHAIN STREQUAL gcc) - set(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc${ANDROID_TOOLCHAIN_SUFFIX}") - set(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++${ANDROID_TOOLCHAIN_SUFFIX}") - set(ANDROID_ASM_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc${ANDROID_TOOLCHAIN_SUFFIX}") - set(ANDROID_AR "${ANDROID_TOOLCHAIN_PREFIX}gcc-ar${ANDROID_TOOLCHAIN_SUFFIX}") - set(ANDROID_RANLIB "${ANDROID_TOOLCHAIN_PREFIX}gcc-ranlib${ANDROID_TOOLCHAIN_SUFFIX}") -else() - message(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}.") -endif() - -if(NOT IS_DIRECTORY "${ANDROID_NDK}/platforms/${ANDROID_PLATFORM}") - message(FATAL_ERROR "Invalid Android platform: ${ANDROID_PLATFORM}.") -elseif(NOT IS_DIRECTORY "${CMAKE_SYSROOT}") - message(FATAL_ERROR "Invalid Android sysroot: ${CMAKE_SYSROOT}.") -endif() - -# Generic flags. 
-list(APPEND ANDROID_COMPILER_FLAGS -# -g - -DANDROID - -ffunction-sections - -funwind-tables - -fstack-protector-strong - -no-canonical-prefixes) -list(APPEND ANDROID_LINKER_FLAGS - -Wl,--build-id - -Wl,--warn-shared-textrel - -Wl,--fatal-warnings) -list(APPEND ANDROID_LINKER_FLAGS_EXE - -Wl,--gc-sections - -Wl,-z,nocopyreloc) - -# Debug and release flags. -list(APPEND ANDROID_COMPILER_FLAGS_DEBUG -O0) -if(ANDROID_ABI MATCHES "^armeabi") - list(APPEND ANDROID_COMPILER_FLAGS_RELEASE -Os) -else() - list(APPEND ANDROID_COMPILER_FLAGS_RELEASE -O2) -endif() -list(APPEND ANDROID_COMPILER_FLAGS_RELEASE -DNDEBUG) -if(ANDROID_TOOLCHAIN STREQUAL clang) - list(APPEND ANDROID_COMPILER_FLAGS_DEBUG -fno-limit-debug-info) -endif() - -# Toolchain and ABI specific flags. -if(ANDROID_ABI STREQUAL armeabi) - list(APPEND ANDROID_COMPILER_FLAGS - -march=armv5te - -mtune=xscale - -msoft-float) -endif() -if(ANDROID_ABI STREQUAL armeabi-v7a) - list(APPEND ANDROID_COMPILER_FLAGS - -march=armv7-a - -mfloat-abi=softfp - -mfpu=vfpv3-d16) - list(APPEND ANDROID_LINKER_FLAGS - -Wl,--fix-cortex-a8) -endif() -if(ANDROID_ABI STREQUAL mips) - list(APPEND ANDROID_COMPILER_FLAGS - -mips32) -endif() -if(ANDROID_ABI STREQUAL "mips64" AND ANDROID_TOOLCHAIN STREQUAL clang) - list(APPEND ANDROID_COMPILER_FLAGS "-fintegrated-as") -endif() -if(ANDROID_ABI MATCHES "^armeabi" AND ANDROID_TOOLCHAIN STREQUAL clang) - # Disable integrated-as for better compatibility. - list(APPEND ANDROID_COMPILER_FLAGS - -fno-integrated-as) -endif() -if(ANDROID_ABI STREQUAL mips AND ANDROID_TOOLCHAIN STREQUAL clang) - # Help clang use mips64el multilib GCC - list(APPEND ANDROID_LINKER_FLAGS - "\"-L${ANDROID_TOOLCHAIN_ROOT}/lib/gcc/${ANDROID_TOOLCHAIN_NAME}/4.9.x/32/mips-r1\"") -endif() -if(ANDROID_ABI STREQUAL x86) - # http://b.android.com/222239 - # http://b.android.com/220159 (internal http://b/31809417) - # x86 devices have stack alignment issues. 
- list(APPEND ANDROID_COMPILER_FLAGS -mstackrealign) -endif() - -# STL specific flags. -if(ANDROID_STL STREQUAL system) - set(ANDROID_STL_PREFIX gnu-libstdc++/4.9) - set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES - "${ANDROID_NDK}/sources/cxx-stl/system/include") -elseif(ANDROID_STL MATCHES "^stlport_") - set(ANDROID_STL_PREFIX stlport) - set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/stlport" - "${ANDROID_NDK}/sources/cxx-stl/gabi++/include") -elseif(ANDROID_STL MATCHES "^gnustl_") - set(ANDROID_STL_PREFIX gnu-libstdc++/4.9) - set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/include" - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libs/${ANDROID_ABI}/include" - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/include/backward") -elseif(ANDROID_STL MATCHES "^c\\+\\+_") - set(ANDROID_STL_PREFIX llvm-libc++) - if(ANDROID_ABI MATCHES "^armeabi") - list(APPEND ANDROID_LINKER_FLAGS -Wl,--exclude-libs,libunwind.a) - endif() - list(APPEND ANDROID_COMPILER_FLAGS_CXX - -std=c++11) - if(ANDROID_TOOLCHAIN STREQUAL gcc) - list(APPEND ANDROID_COMPILER_FLAGS_CXX - -fno-strict-aliasing) - endif() - - # Add the libc++ lib directory to the path so the linker scripts can pick up - # the extra libraries. 
- list(APPEND ANDROID_LINKER_FLAGS - "-L${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libs/${ANDROID_ABI}") - - set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/include" - "${ANDROID_NDK}/sources/android/support/include" - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}abi/include") -endif() -set(ANDROID_CXX_STANDARD_LIBRARIES) -foreach(library ${ANDROID_STL_STATIC_LIBRARIES}) - list(APPEND ANDROID_CXX_STANDARD_LIBRARIES - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libs/${ANDROID_ABI}/lib${library}.a") -endforeach() -foreach(library ${ANDROID_STL_SHARED_LIBRARIES}) - list(APPEND ANDROID_CXX_STANDARD_LIBRARIES - "${ANDROID_NDK}/sources/cxx-stl/${ANDROID_STL_PREFIX}/libs/${ANDROID_ABI}/lib${library}.so") -endforeach() -set(CMAKE_C_STANDARD_LIBRARIES_INIT "-latomic -lm") -set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_C_STANDARD_LIBRARIES_INIT}") -if(ANDROID_CXX_STANDARD_LIBRARIES) - string(REPLACE ";" "\" \"" ANDROID_CXX_STANDARD_LIBRARIES "\"${ANDROID_CXX_STANDARD_LIBRARIES}\"") - set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_CXX_STANDARD_LIBRARIES}") -endif() - -# Configuration specific flags. 
-if(ANDROID_PIE) - set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - list(APPEND ANDROID_LINKER_FLAGS_EXE - -pie - -fPIE) -endif() -if(ANDROID_CPP_FEATURES) - separate_arguments(ANDROID_CPP_FEATURES) - foreach(feature ${ANDROID_CPP_FEATURES}) - if(NOT ${feature} MATCHES "^(rtti|exceptions)$") - message(FATAL_ERROR "Invalid Android C++ feature: ${feature}.") - endif() - list(APPEND ANDROID_COMPILER_FLAGS_CXX - -f${feature}) - endforeach() - string(REPLACE ";" " " ANDROID_CPP_FEATURES "${ANDROID_CPP_FEATURES}") -endif() -if(NOT ANDROID_ALLOW_UNDEFINED_SYMBOLS) - list(APPEND ANDROID_LINKER_FLAGS - -Wl,--no-undefined) -endif() -if(ANDROID_ABI MATCHES "armeabi") - if(ANDROID_ARM_MODE STREQUAL thumb) - list(APPEND ANDROID_COMPILER_FLAGS - -mthumb) - elseif(ANDROID_ARM_MODE STREQUAL arm) - list(APPEND ANDROID_COMPILER_FLAGS - -marm) - else() - message(FATAL_ERROR "Invalid Android ARM mode: ${ANDROID_ARM_MODE}.") - endif() - if(ANDROID_ABI STREQUAL armeabi-v7a AND ANDROID_ARM_NEON) - list(APPEND ANDROID_COMPILER_FLAGS - -mfpu=neon) - endif() -endif() -if(ANDROID_DISABLE_NO_EXECUTE) - list(APPEND ANDROID_COMPILER_FLAGS - -Wa,--execstack) - list(APPEND ANDROID_LINKER_FLAGS - -Wl,-z,execstack) -else() - list(APPEND ANDROID_COMPILER_FLAGS - -Wa,--noexecstack) - list(APPEND ANDROID_LINKER_FLAGS - -Wl,-z,noexecstack) -endif() -if(ANDROID_TOOLCHAIN STREQUAL clang) - # CMake automatically forwards all compiler flags to the linker, - # and clang doesn't like having -Wa flags being used for linking. - # To prevent CMake from doing this would require meddling with - # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. 
- list(APPEND ANDROID_LINKER_FLAGS - -Qunused-arguments) -endif() -if(ANDROID_DISABLE_RELRO) - list(APPEND ANDROID_LINKER_FLAGS - -Wl,-z,norelro -Wl,-z,lazy) -else() - list(APPEND ANDROID_LINKER_FLAGS - -Wl,-z,relro -Wl,-z,now) -endif() -if(ANDROID_DISABLE_FORMAT_STRING_CHECKS) - list(APPEND ANDROID_COMPILER_FLAGS - -Wno-error=format-security) -else() - list(APPEND ANDROID_COMPILER_FLAGS - -Wformat -Werror=format-security) -endif() - -# Convert these lists into strings. -string(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") -string(REPLACE ";" " " ANDROID_COMPILER_FLAGS_CXX "${ANDROID_COMPILER_FLAGS_CXX}") -string(REPLACE ";" " " ANDROID_COMPILER_FLAGS_DEBUG "${ANDROID_COMPILER_FLAGS_DEBUG}") -string(REPLACE ";" " " ANDROID_COMPILER_FLAGS_RELEASE "${ANDROID_COMPILER_FLAGS_RELEASE}") -string(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") -string(REPLACE ";" " " ANDROID_LINKER_FLAGS_EXE "${ANDROID_LINKER_FLAGS_EXE}") - -if(ANDROID_CCACHE) - set(CMAKE_C_COMPILER_LAUNCHER "${ANDROID_CCACHE}") - set(CMAKE_CXX_COMPILER_LAUNCHER "${ANDROID_CCACHE}") -endif() -set(CMAKE_C_COMPILER "${ANDROID_C_COMPILER}") -set(CMAKE_CXX_COMPILER "${ANDROID_CXX_COMPILER}") -set(CMAKE_AR "${ANDROID_AR}" CACHE FILEPATH "Archiver") -set(CMAKE_RANLIB "${ANDROID_RANLIB}" CACHE FILEPATH "Ranlib") -set(_CMAKE_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_PREFIX}") - -if(ANDROID_ABI STREQUAL "x86" OR ANDROID_ABI STREQUAL "x86_64") - set(CMAKE_ASM_NASM_COMPILER - "${ANDROID_HOST_PREBUILTS}/bin/yasm${ANDROID_TOOLCHAIN_SUFFIX}") - set(CMAKE_ASM_NASM_COMPILER_ARG1 "-DELF") -endif() - -# Set or retrieve the cached flags. -# This is necessary in case the user sets/changes flags in subsequent -# configures. If we included the Android flags in here, they would get -# overwritten. 
-set(CMAKE_C_FLAGS "" - CACHE STRING "Flags used by the compiler during all build types.") -set(CMAKE_CXX_FLAGS "" - CACHE STRING "Flags used by the compiler during all build types.") -set(CMAKE_ASM_FLAGS "" - CACHE STRING "Flags used by the compiler during all build types.") -set(CMAKE_C_FLAGS_DEBUG "" - CACHE STRING "Flags used by the compiler during debug builds.") -set(CMAKE_CXX_FLAGS_DEBUG "" - CACHE STRING "Flags used by the compiler during debug builds.") -set(CMAKE_ASM_FLAGS_DEBUG "" - CACHE STRING "Flags used by the compiler during debug builds.") -set(CMAKE_C_FLAGS_RELEASE "" - CACHE STRING "Flags used by the compiler during release builds.") -set(CMAKE_CXX_FLAGS_RELEASE "" - CACHE STRING "Flags used by the compiler during release builds.") -set(CMAKE_ASM_FLAGS_RELEASE "" - CACHE STRING "Flags used by the compiler during release builds.") -set(CMAKE_MODULE_LINKER_FLAGS "" - CACHE STRING "Flags used by the linker during the creation of modules.") -set(CMAKE_SHARED_LINKER_FLAGS "" - CACHE STRING "Flags used by the linker during the creation of dll's.") -set(CMAKE_EXE_LINKER_FLAGS "" - CACHE STRING "Flags used by the linker.") - -set(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${ANDROID_COMPILER_FLAGS_CXX} ${CMAKE_CXX_FLAGS}") -set(CMAKE_ASM_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_ASM_FLAGS}") -set(CMAKE_C_FLAGS_DEBUG "${ANDROID_COMPILER_FLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}") -set(CMAKE_CXX_FLAGS_DEBUG "${ANDROID_COMPILER_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}") -set(CMAKE_ASM_FLAGS_DEBUG "${ANDROID_COMPILER_FLAGS_DEBUG} ${CMAKE_ASM_FLAGS_DEBUG}") -set(CMAKE_C_FLAGS_RELEASE "${ANDROID_COMPILER_FLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}") -set(CMAKE_CXX_FLAGS_RELEASE "${ANDROID_COMPILER_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}") -set(CMAKE_ASM_FLAGS_RELEASE "${ANDROID_COMPILER_FLAGS_RELEASE} ${CMAKE_ASM_FLAGS_RELEASE}") -set(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} 
${CMAKE_SHARED_LINKER_FLAGS}") -set(CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}") -set(CMAKE_EXE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${ANDROID_LINKER_FLAGS_EXE} ${CMAKE_EXE_LINKER_FLAGS}") - -# Compatibility for read-only variables. -# Read-only variables for compatibility with the other toolchain file. -# We'll keep these around for the existing projects that still use them. -# TODO: All of the variables here have equivalents in our standard set of -# configurable variables, so we can remove these once most of our users migrate -# to those variables. -set(ANDROID_NATIVE_API_LEVEL ${ANDROID_PLATFORM_LEVEL}) -if(ANDROID_ALLOW_UNDEFINED_SYMBOLS) - set(ANDROID_SO_UNDEFINED TRUE) -else() - set(ANDROID_NO_UNDEFINED TRUE) -endif() -set(ANDROID_FUNCTION_LEVEL_LINKING TRUE) -set(ANDROID_GOLD_LINKER TRUE) -if(NOT ANDROID_DISABLE_NO_EXECUTE) - set(ANDROID_NOEXECSTACK TRUE) -endif() -if(NOT ANDROID_DISABLE_RELRO) - set(ANDROID_RELRO TRUE) -endif() -if(ANDROID_ARM_MODE STREQUAL arm) - set(ANDROID_FORCE_ARM_BUILD TRUE) -endif() -if(ANDROID_CPP_FEATURES MATCHES "rtti" - AND ANDROID_CPP_FEATURES MATCHES "exceptions") - set(ANDROID_STL_FORCE_FEATURES TRUE) -endif() -if(ANDROID_CCACHE) - set(NDK_CCACHE "${ANDROID_CCACHE}") -endif() -if(ANDROID_TOOLCHAIN STREQUAL clang) - set(ANDROID_TOOLCHAIN_NAME ${ANDROID_TOOLCHAIN_NAME}-clang) -else() - set(ANDROID_TOOLCHAIN_NAME ${ANDROID_TOOLCHAIN_NAME}-4.9) -endif() -set(ANDROID_NDK_HOST_X64 TRUE) -set(ANDROID_NDK_LAYOUT RELEASE) -if(ANDROID_ABI STREQUAL armeabi) - set(ARMEABI TRUE) -elseif(ANDROID_ABI STREQUAL armeabi-v7a) - set(ARMEABI_V7A TRUE) - if(ANDROID_ARM_NEON) - set(NEON TRUE) - endif() -elseif(ANDROID_ABI STREQUAL arm64-v8a) - set(ARM64_V8A TRUE) -elseif(ANDROID_ABI STREQUAL x86) - set(X86 TRUE) -elseif(ANDROID_ABI STREQUAL x86_64) - set(X86_64 TRUE) -elseif(ANDROID_ABI STREQUAL mips) - set(MIPS TRUE) -elseif(ANDROID_ABI STREQUAL mips64) - set(MIPS64 TRUE) -endif() 
-set(ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_HOST_TAG}) -set(ANDROID_NDK_ABI_NAME ${ANDROID_ABI}) -set(ANDROID_NDK_RELEASE r${ANDROID_NDK_REVISION}) -set(ANDROID_ARCH_NAME ${ANDROID_SYSROOT_ABI}) -set(ANDROID_SYSROOT "${CMAKE_SYSROOT}") -set(TOOL_OS_SUFFIX ${ANDROID_TOOLCHAIN_SUFFIX}) -if(ANDROID_TOOLCHAIN STREQUAL clang) - set(ANDROID_COMPILER_IS_CLANG TRUE) -endif() - -# CMake 3.7+ compatibility. -if (CMAKE_VERSION VERSION_GREATER 3.7.0) - set(CMAKE_ANDROID_NDK ${ANDROID_NDK}) - - if(ANDROID_TOOLCHAIN STREQUAL gcc) - set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION 4.9) - else() - set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION clang) - endif() - - set(CMAKE_ANDROID_STL_TYPE ${ANDROID_STL}) - - if(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - set(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) - set(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - endif() -endif() diff --git a/mobile/tools/android-debug-script/push2android.sh b/mobile/tools/android-debug-script/push2android.sh deleted file mode 100644 index a367bb6a29..0000000000 --- a/mobile/tools/android-debug-script/push2android.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env sh - -push_fn () { -MODELS_PATH="../../test/models/*" -MODELS_SRC="../../test/models" -IMAGE_PATH="../../test/images/*" -EXE_FILE="../../test/build/*" -EXE_DIR="/data/local/tmp/bin" -adb shell mkdir ${EXE_DIR} -MODELS_DIR="/data/local/tmp/models" -adb shell mkdir ${MODELS_DIR} -for file in `ls ${MODELS_SRC}` -do - adb shell mkdir ${MODELS_DIR}"/"${file} -done - -if [[ -d "../../src/operators/kernel/mali/ACL_Android/build" ]]; then -ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*" -adb push ${ACL_BUILD_PATH} ${EXE_DIR} -fi - -IMAGES_DIR="/data/local/tmp/images" -adb shell mkdir ${IMAGES_DIR} -LIB_PATH="../../build/release/arm-v7a/build/*" -#LIB_PATH="../../build/release/arm-v8a/build/*" -adb push ${EXE_FILE} ${EXE_DIR} -for file in ${LIB_PATH} -do - adb push ${file} ${EXE_DIR} -done - -if [[ $1 != "npm" ]]; then -adb push ${IMAGE_PATH} 
${IMAGES_DIR} -adb push ${MODELS_PATH} ${MODELS_DIR} -fi -} - -if [[ $1 == "npm" ]]; then -push_fn $1 -else -push_fn -fi diff --git a/mobile/tools/android-debug-script/run_on_android.sh b/mobile/tools/android-debug-script/run_on_android.sh deleted file mode 100644 index cb5a634860..0000000000 --- a/mobile/tools/android-debug-script/run_on_android.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env sh - -push_fn () { -MODELS_PATH="../../test/models/*" -MODELS_SRC="../../test/models" -IMAGE_PATH="../../test/images/*" -EXE_FILE="../../test/build/*" -EXE_DIR="data/local/tmp/bin" -adb shell mkdir ${EXE_DIR} -MODELS_DIR="data/local/tmp/models" -adb shell mkdir ${MODELS_DIR} -for file in `ls ${MODELS_SRC}` -do - adb shell mkdir ${MODELS_DIR}"/"${file} -done - -IMAGES_DIR="data/local/tmp/images" -adb shell mkdir ${IMAGES_DIR} -LIB_PATH="../../build/release/arm-v7a/build/*" -adb push ${EXE_FILE} ${EXE_DIR} -adb push ${LIB_PATH} ${EXE_DIR} -if [[ $1 != "npm" ]]; then -adb push ${IMAGE_PATH} ${IMAGES_DIR} -adb push ${MODELS_PATH} ${MODELS_DIR} -fi -echo "test-op or test-net below : " -adb shell ls /data/local/tmp/bin -echo "**** choose OP or NET to test ****" -read -p "which to test : " test_name -adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. 
./${test_name}" -} - -if [[ $1 == "npm" ]]; then -push_fn $1 -else -push_fn -fi diff --git a/mobile/tools/arm-platform.cmake b/mobile/tools/arm-platform.cmake deleted file mode 100644 index 9f2b6d5e89..0000000000 --- a/mobile/tools/arm-platform.cmake +++ /dev/null @@ -1,9 +0,0 @@ - -set(ARCH "armv7-a") - -set(FLOAT_ABI "softfp" CACHE STRING "-mfloat-api chosen") -set_property(CACHE FLOAT_ABI PROPERTY STRINGS "softfp" "soft" "hard") - -set(FPU "neon") - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${ARCH} -mfloat-abi=${FLOAT_ABI} -mfpu=${FPU}") diff --git a/mobile/tools/build.sh b/mobile/tools/build.sh deleted file mode 100755 index 3dc579ecf0..0000000000 --- a/mobile/tools/build.sh +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env bash -NETS="" -declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet" "super" "op") - -# merge cl to so -merge_cl_to_so=1 -opencl_kernels="opencl_kernels.cpp" -cd ../src/operators/kernel/cl -if [[ -f "${opencl_kernels}" ]]; then - rm "${opencl_kernels}" -fi -python gen_code.py "${merge_cl_to_so}" > "${opencl_kernels}" -cd - - -# get cl headers -opencl_header_dir="../third_party/opencl/OpenCL-Headers" -commit_id="320d7189b3e0e7b6a8fc5c10334c79ef364b5ef6" -if [[ -d "$opencl_header_dir" && -d "$opencl_header_dir/.git" ]]; then - echo "pulling opencl headers" - cd $opencl_header_dir - git stash - git pull - git checkout $commit_id - cd - -else - echo "cloning opencl headers" - rm -rf $opencl_header_dir - git clone https://github.com/KhronosGroup/OpenCL-Headers $opencl_header_dir - git checkout $commit_id -fi - -build_for_mac() { - if [ ! `which brew` ]; then - echo "building failed! homebrew not found, please install homebrew." - return - fi - if [ ! `which cmake` ]; then - echo "installing cmake." - brew install cmake - if [ ! $? ]; then - echo "cmake install failed." 
- return - fi - fi - PLATFORM="x86" - MODE="Release" - BUILD_DIR=../build/release/"${PLATFORM}" - mkdir -p ${BUILD_DIR}/build - - mkdir -p ${BUILD_DIR}/test - cp -r ../test/models ${BUILD_DIR}/test/models - - cmake .. \ - -B"${BUILD_DIR}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DIS_MAC=true - - cd ${BUILD_DIR} - make -j 8 -} - -build_for_android() { - # rm -rf "../build" - if [ -z "${NDK_ROOT}" ]; then - echo "NDK_ROOT not found!" - exit -1 - fi - - if [ -z "$PLATFORM" ]; then - PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. - # PLATFORM="arm-v8a" - fi - - if [ "${PLATFORM}" = "arm-v7a" ]; then - ABI="armeabi-v7a with NEON" - ARM_PLATFORM="V7" - CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security" - elif [ "${PLATFORM}" = "arm-v8a" ]; then - ABI="arm64-v8a" - ARM_PLATFORM="V8" - CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog -fuse-ld=gold" - else - echo "unknown platform!" - exit -1 - fi - - - MODE="Release" - ANDROID_PLATFORM_VERSION="android-19" - TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" - ANDROID_ARM_MODE="arm" - - if [ "${#NETS}" -gt 1 ]; then - cmake .. \ - -B"../build/release/${PLATFORM}" \ - -DANDROID_ABI="${ABI}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -DNET="${NETS}" \ - -D"${ARM_PLATFORM}"=true - else - - cmake .. 
\ - -B"../build/release/${PLATFORM}" \ - -DANDROID_ABI="${ABI}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -D"${ARM_PLATFORM}"=true - fi - cd "../build/release/${PLATFORM}" - make -j 8 - mkdir ./build/cl_kernel - cp ../../../src/operators/kernel/cl/cl_kernel/* ./build/cl_kernel/ -} - -build_for_arm_linux() { - MODE="Release" - ARM_LINUX="arm-linux" - - if [ "${#NETS}" -gt 1 ]; then - cmake .. \ - -B"../build/release/arm-linux" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \ - -DCMAKE_CXX_FLAGS=" " \ - -DNET="${NETS}" \ - -D"V7"=true - else - cmake .. \ - -B"../build/release/arm-linux" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \ - -DCMAKE_CXX_FLAGS=" " \ - -DNET="${NETS}" \ - -D"V7"=true - fi - - cd "../build/release/arm-linux" - make -j 2 - - cd "../../../test/" - DIRECTORY="models" - if [ "`ls -A $DIRECTORY`" = "" ]; then - echo "$DIRECTORY is indeed empty pull images" - wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip - unzip paddle-mobile%2FmodelsAndImages.zip - mv modelsAndImages/images/ images - mv modelsAndImages/models/ models - rm -rf paddle-mobile%2FmodelsAndImages.zip - rm -rf __MACOS - else - echo "$DIRECTORY is indeed not empty, DONE!" - fi - -} - -build_for_ios() { -# rm -rf "../build" - PLATFORM="ios" - MODE="Release" - BUILD_DIR=../build/release/"${PLATFORM}"/ - TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" - mkdir -p "${BUILD_DIR}" - if [ "${#NETS}" -gt 1 ]; then - cmake .. \ - -B"${BUILD_DIR}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DIOS_PLATFORM=OS \ - -DIOS_ARCH="${IOS_ARCH}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DNET="${NETS}" \ - -DIS_IOS="true" - else - cmake .. 
\ - -B"${BUILD_DIR}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DIOS_PLATFORM=OS \ - -DIOS_ARCH="${IOS_ARCH}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DIS_IOS="true" - fi - cd "${BUILD_DIR}" - make -j 8 - cp ../../../src/io/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h - cd ./build - # 生成符号表 - ranlib *.a -} - -build_error() { - echo "unknown target : $1" -} - -if [ $# -lt 1 ]; then - echo "error: target missing!" - echo "available targets: ios|android" - echo "sample usage: ./build.sh android" -else - params=($@) - for(( i=1; i<$#; i++ )); do - if [ ${i} != 1 ]; then - NETS=$NETS$";" - fi - NETS=$NETS$"${params[i]}" - done - params=${@:2} - - supported=false - for name in ${params[@]}; do - for net in ${supportedNets[@]}; do - match=false - if [ "$name"x = "$net"x ];then - supported=true - match=true - break 1 - fi - done - if [ "$match" = false ];then - echo "${name} not supported!" - echo "supported nets are: ${supportedNets[@]}" - exit -1 - fi - done - - if [ $1 = "android" ]; then - build_for_android - elif [ $1 = "arm_linux" ]; then - build_for_arm_linux - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error "$1" - fi -fi diff --git a/mobile/tools/build_android_armv7.sh b/mobile/tools/build_android_armv7.sh deleted file mode 100755 index 9466aa300e..0000000000 --- a/mobile/tools/build_android_armv7.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env bash - -# merge cl to so -merge_cl_to_so=1 -opencl_kernels="opencl_kernels.cpp" -cd ../src/operators/kernel/cl -if [[ -f "${opencl_kernels}" ]]; then - rm "${opencl_kernels}" -fi -python gen_code.py "${merge_cl_to_so}" >"${opencl_kernels}" -cd - - -# get cl headers -opencl_header_dir="../third_party/opencl/OpenCL-Headers" -commit_id="320d7189b3e0e7b6a8fc5c10334c79ef364b5ef6" -if [[ -d "$opencl_header_dir" && -d "$opencl_header_dir/.git" ]]; then - echo "pulling opencl headers" - cd $opencl_header_dir - git stash - git pull - git checkout $commit_id - cd - -else - echo "cloning opencl headers" - 
rm -rf $opencl_header_dir - git clone https://github.com/KhronosGroup/OpenCL-Headers $opencl_header_dir - git checkout $commit_id -fi - -build_for_android() { - # rm -rf "../build" - if [ -z "${NDK_ROOT}" ]; then - echo "NDK_ROOT not found!" - exit -1 - fi - - if [ -z "$PLATFORM" ]; then - PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. - # PLATFORM="arm-v8a" - fi - - if [ "${PLATFORM}" = "arm-v7a" ]; then - ABI="armeabi-v7a with NEON" - ARM_PLATFORM="V7" - CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security" - elif [ "${PLATFORM}" = "arm-v8a" ]; then - ABI="arm64-v8a" - ARM_PLATFORM="V8" - CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog -fuse-ld=gold" - else - echo "unknown platform!" - exit -1 - fi - - MODE="Release" - ANDROID_PLATFORM_VERSION="android-19" - TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" - ANDROID_ARM_MODE="arm" - - cmake .. \ - -B"../buildreleasev7/${PLATFORM}" \ - -DANDROID_ABI="${ABI}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -DWITH_LOGGING=OFF \ - -DWITH_PROFILE=OFF \ - -DWITH_TEST=OFF \ - -D"${ARM_PLATFORM}"=true - - cd "../buildreleasev7/${PLATFORM}" - make -j 8 -} - -build_for_android diff --git a/mobile/tools/build_android_armv8.sh b/mobile/tools/build_android_armv8.sh deleted file mode 100755 index 3517227eaa..0000000000 --- a/mobile/tools/build_android_armv8.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env bash - -# merge cl to so -merge_cl_to_so=1 -opencl_kernels="opencl_kernels.cpp" -cd ../src/operators/kernel/cl -if [[ -f "${opencl_kernels}" ]]; then - rm "${opencl_kernels}" -fi -python gen_code.py "${merge_cl_to_so}" >"${opencl_kernels}" -cd - - -# get cl headers -opencl_header_dir="../third_party/opencl/OpenCL-Headers" 
-commit_id="320d7189b3e0e7b6a8fc5c10334c79ef364b5ef6" -if [[ -d "$opencl_header_dir" && -d "$opencl_header_dir/.git" ]]; then - echo "pulling opencl headers" - cd $opencl_header_dir - git stash - git pull - git checkout $commit_id - cd - -else - echo "cloning opencl headers" - rm -rf $opencl_header_dir - git clone https://github.com/KhronosGroup/OpenCL-Headers $opencl_header_dir - git checkout $commit_id -fi - -build_for_android() { - # rm -rf "../build" - if [ -z "${NDK_ROOT}" ]; then - echo "NDK_ROOT not found!" - exit -1 - fi - - if [ -z "$PLATFORM" ]; then - # PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. - PLATFORM="arm-v8a" - fi - - if [ "${PLATFORM}" = "arm-v7a" ]; then - ABI="armeabi-v7a with NEON" - ARM_PLATFORM="V7" - CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security" - elif [ "${PLATFORM}" = "arm-v8a" ]; then - ABI="arm64-v8a" - ARM_PLATFORM="V8" - CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog -fuse-ld=gold" - else - echo "unknown platform!" - exit -1 - fi - - MODE="Release" - ANDROID_PLATFORM_VERSION="android-19" - TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" - ANDROID_ARM_MODE="arm" - - cmake .. \ - -B"../buildreleasev8/${PLATFORM}" \ - -DANDROID_ABI="${ABI}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -DWITH_LOGGING=OFF \ - -DWITH_PROFILE=OFF \ - -DWITH_TEST=OFF \ - -D"${ARM_PLATFORM}"=true - - cd "../buildreleasev8/${PLATFORM}" - make -j 8 -} - -build_for_android diff --git a/mobile/tools/ci_build.sh b/mobile/tools/ci_build.sh deleted file mode 100755 index 8bd892c22d..0000000000 --- a/mobile/tools/ci_build.sh +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -source ./ci_run_test.sh - -function print_usage() { - echo "\n${RED}Usage${NONE}: - ${BOLD}${SCRIPT_NAME}${NONE} [Option] [Network]" - - echo "\n${RED}Option${NONE}: required, specify the target platform - ${BLUE}android_armv7${NONE}: run build for android armv7 platform - ${BLUE}android_armv8${NONE}: run build for android armv8 platform - ${BLUE}ios${NONE}: run build for apple ios platform - ${BLUE}linux_armv7${NONE}: run build for linux armv7 platform - ${BLUE}linux_armv8${NONE}: run build for linux armv8 platform - ${BLUE}fpga${NONE}: run build for fpga platform - " - echo "\n${RED}Network${NONE}: optional, for deep compressing the framework size - ${BLUE}googlenet${NONE}: build only googlenet support - ${BLUE}mobilenet${NONE}: build only mobilenet support - ${BLUE}yolo${NONE}: build only yolo support - ${BLUE}squeezenet${NONE}: build only squeezenet support - ${BLUE}resnet${NONE}: build only resnet support - ${BLUE}mobilenetssd${NONE}: build only mobilenetssd support - ${BLUE}nlp${NONE}: build only nlp model support - ${BLUE}mobilenetfssd${NONE}: build only mobilenetfssd support - ${BLUE}genet${NONE}: build only genet support - ${BLUE}super${NONE}: build only super support - " -} - -function init() { - RED='\033[0;31m' - BLUE='\033[0;34m' - BOLD='\033[1m' - NONE='\033[0m' - - PADDLE_MOBILE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )" - if [ -z "${SCRIPT_NAME}" ]; then - SCRIPT_NAME=$0 - fi -} 
- -function check_ndk() { - if [ -z "${NDK_ROOT}" ]; then - echo "Should set NDK_ROOT as your android ndk path, such as\n" - echo " export NDK_ROOT=~/android-ndk-r14b\n" - exit -1 - fi -} - -function build_android_armv7_cpu_only() { -# rm -rf ../build/armeabi-v7a - cmake .. \ - -B"../build/armeabi-v7a" \ - -DANDROID_ABI="armeabi-v7a with NEON" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" \ - -DANDROID_PLATFORM="android-22" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -DWITH_LOGGING=OFF \ - -DCPU=ON \ - -DGPU_CL=OFF \ - -DFPGA=OFF - - cd ../build/armeabi-v7a && make -j 8 - cd - -} - -function build_android_armv7_gpu() { - rm -rf ../build/armeabi-v7a - cmake .. \ - -B"../build/armeabi-v7a" \ - -DANDROID_ABI="armeabi-v7a with NEON" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" \ - -DANDROID_PLATFORM="android-22" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -DWITH_LOGGING=OFF \ - -DCPU=ON \ - -DGPU_CL=ON \ - -DFPGA=OFF - - cd ../build/armeabi-v7a && make -j 8 - cd - -} - -function build_android_armv8_cpu_only() { - rm -rf ../build/arm64-v8a - cmake .. \ - -B"../build/arm64-v8a" \ - -DANDROID_ABI="arm64-v8a" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" \ - -DANDROID_PLATFORM="android-22" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -DWITH_LOGGING=OFF \ - -DCPU=ON \ - -DGPU_CL=OFF \ - -DFPGA=OFF - - cd ../build/arm64-v8a && make -j 1 - cd - -} - -function build_android_armv8_gpu() { - rm -rf ../build/arm64-v8a - cmake .. 
\ - -B"../build/arm64-v8a" \ - -DANDROID_ABI="arm64-v8a" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" \ - -DANDROID_PLATFORM="android-22" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -DWITH_LOGGING=OFF \ - -DCPU=ON \ - -DGPU_CL=ON \ - -DFPGA=OFF - - cd ../build/arm64-v8a && make -j 8 - cd - -} - -function build_ios_armv8_cpu_only() { - rm -rf ../build/ios - cmake .. \ - -B"../build/ios" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" \ - -DIOS_PLATFORM=OS \ - -DIOS_ARCH="${IOS_ARCH}" \ - -DIS_IOS=true \ - -DUSE_OPENMP=OFF \ - -DCPU=ON \ - -DGPU_CL=OFF \ - -DFPGA=OFF - - cd ../build/ios && make -j 8 - cd - -} - -function build_ios_armv8_gpu() { - rm -rf ../build/ios - cmake .. \ - -B"../build/ios" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" \ - -DIOS_PLATFORM=OS \ - -DIOS_ARCH="${IOS_ARCH}" \ - -DIS_IOS=true \ - -DUSE_OPENMP=OFF \ - -DCPU=ON \ - -DGPU_CL=ON \ - -DFPGA=OFF - - cd ../build/ios && make -j 8 - cd - -} - -function build_linux_armv7_cpu_only() { - rm -rf ../build/armv7_linux - cmake .. \ - -B"../build/armv7_linux" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \ - -DCPU=ON \ - -DGPU_CL=OFF \ - -DFPGA=OFF - - cd ../build/armv7_linux && make -j 8 - cd - -} - -function build_linux_armv7_gpu() { - rm -rf ../build/armv7_linux - cmake .. 
\ - -B"../build/armv7_linux" \ - -DCMAKE_BUILD_TYPE="MinSizeRel" \ - -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \ - -DCPU=ON \ - -DGPU_CL=ON \ - -DFPGA=OFF - - cd ../build/armv7_linux && make -j 8 - cd - -} - -function build_android_armv7() { - check_ndk - build_android_armv7_cpu_only - # build_android_armv7_gpu -} - -function build_android_armv8() { - check_ndk - build_android_armv8_cpu_only - # build_android_armv8_gpu -} - -function build_ios() { - build_ios_armv8_cpu_only - # build_ios_armv8_gpu -} - -function build_linux_armv7() { - build_linux_armv7_cpu_only - # build_linux_armv7_gpu -} - -function build_linux_fpga() { - cd .. - image=`docker images paddle-mobile:dev | grep 'paddle-mobile'` - if [[ "x"$image == "x" ]]; then - docker build -t paddle-mobile:dev - < Dockerfile - fi - docker run --rm -v `pwd`:/workspace paddle-mobile:dev bash /workspace/tools/docker_build_fpga.sh - cd - -} - -function run_android_test() { - ExecuteAndroidTests $1 -} - -function main() { - local CMD=$1 - init - case $CMD in - android_armv7) - build_android_armv7 - run_android_test armeabi-v7a - ;; - android_armv8) - build_android_armv8 - run_android_test arm64-v8a - ;; - ios) - build_ios - ;; - linux_armv7) - build_linux_armv7 - ;; - fpga) - build_linux_fpga - ;; - *) - print_usage - exit 0 - ;; - esac -} - -main $@ diff --git a/mobile/tools/ci_run_test.sh b/mobile/tools/ci_run_test.sh deleted file mode 100644 index 6470a97b15..0000000000 --- a/mobile/tools/ci_run_test.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -operators= - -function AddTest() { - operators="${operators} $1" -} - -function ExecuteAndroidTests() { - platform=$1 - devices=`adb devices | grep -v devices | grep device | awk -F ' ' '{print $1}'` - for device in ${devices}; do - adb -s ${device} shell rm -rf /data/local/tmp/* - adb -s ${device} push ../build/${platform}/build/libpaddle-mobile.so /data/local/tmp/ - for op in ${operators}; do - adb -s ${device} push 
../test/build/test-${op}-op /data/local/tmp/ - adb -s ${device} shell "cd /data/local/tmp/; LD_LIBRARY_PATH=. ./test-${op}-op" - echo "${BLUE}run test ${op} pass${NONE}" - done - done -} - -AddTest batchnorm -AddTest cast -AddTest conv -AddTest dequantize -#AddTest elementwiseadd -AddTest log -AddTest logical-and -AddTest logical-not -AddTest logical-or -AddTest logical-xor -AddTest pool -AddTest quantize -AddTest relu -AddTest relu6 -AddTest sequence-expand -AddTest sequence-pool -AddTest sequence-softmax -AddTest sigmoid -AddTest softmax -AddTest tanh -AddTest topk diff --git a/mobile/tools/docker_build_fpga.sh b/mobile/tools/docker_build_fpga.sh deleted file mode 100644 index 9ca9406f43..0000000000 --- a/mobile/tools/docker_build_fpga.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -apt-get update -apt-get install -y gcc g++ cmake - -cd /workspace && mkdir build -cd build && cmake .. -DCPU=OFF -DGPU_CL=OFF -DFPGA=ON && make -j4 diff --git a/mobile/tools/ios-cmake/ios.toolchain.cmake b/mobile/tools/ios-cmake/ios.toolchain.cmake deleted file mode 100644 index 12dd1721d4..0000000000 --- a/mobile/tools/ios-cmake/ios.toolchain.cmake +++ /dev/null @@ -1,216 +0,0 @@ -# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake -# files which are included with CMake 2.8.4 -# It has been altered for iOS development - -# Options: -# -# IOS_PLATFORM = OS (default) or SIMULATOR or SIMULATOR64 -# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders -# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. -# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. -# -# CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. 
-# If set manually, it will override the default location and force the user of a particular Developer Platform -# -# CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. -# In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. -# If set manually, this will force the use of a specific SDK version - -# Macros: -# -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) -# A convenience macro for setting xcode specific properties on targets -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") -# -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the iOS environment. -# Thanks to the android-cmake project for providing the command - -# Standard settings -set (CMAKE_SYSTEM_NAME Darwin) -set (CMAKE_SYSTEM_VERSION 1) -set (UNIX True) -set (APPLE True) -set (IOS True) -set (IOS_ARCH armv7 armv7s arm64) - -# Required as of cmake 2.8.10 -set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) - -# Determine the cmake host system version so we know where to find the iOS SDKs -find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin) -if (CMAKE_UNAME) - exec_program(uname ARGS -r OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION) - string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}") -endif (CMAKE_UNAME) - -# Force the compilers to gcc for iOS -#include (CMakeForceCompiler) -#CMAKE_C_COMPILER (/usr/bin/gcc) -#CMAKE_CXX_COMPILER (/usr/bin/g++) -if(USE_OPENMP) - set(CMAKE_C_COMPILER /usr/local/opt/llvm/bin/clang) - set(CMAKE_CXX_COMPILER /usr/local/opt/llvm/bin/clang++) -else() - set(CMAKE_C_COMPILER /usr/bin/gcc) - set(CMAKE_CXX_COMPILER /usr/bin/g++) -endif() -set(CMAKE_AR ar CACHE FILEPATH "" FORCE) - -# Skip the platform 
compiler checks for cross compiling -set (CMAKE_CXX_COMPILER_WORKS TRUE) -set (CMAKE_C_COMPILER_WORKS TRUE) - -# All iOS/Darwin specific settings - some may be redundant -set (CMAKE_SHARED_LIBRARY_PREFIX "lib") -set (CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") -set (CMAKE_SHARED_MODULE_PREFIX "lib") -set (CMAKE_SHARED_MODULE_SUFFIX ".so") -set (CMAKE_MODULE_EXISTS 1) -set (CMAKE_DL_LIBS "") - -set (CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -# Hidden visibilty is required for cxx on iOS -set (CMAKE_C_FLAGS_INIT "") -set (CMAKE_CXX_FLAGS_INIT "-fvisibility=hidden -fvisibility-inlines-hidden") - -set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") -set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") - -set (CMAKE_PLATFORM_HAS_INSTALLNAME 1) -set (CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names") -set (CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names") -set (CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set (CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set (CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") - -# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree -# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache -# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) -# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex -if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - -# Setup iOS platform unless specified manually with IOS_PLATFORM -if (NOT DEFINED 
IOS_PLATFORM) - set (IOS_PLATFORM "OS") -endif (NOT DEFINED IOS_PLATFORM) -set (IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") - -# Setup building for arm64 or not -if (NOT DEFINED BUILD_ARM64) - set (BUILD_ARM64 true) -endif (NOT DEFINED BUILD_ARM64) -set (BUILD_ARM64 ${BUILD_ARM64} CACHE STRING "Build arm64 arch or not") - -# Check the platform selection and setup for developer root -if (${IOS_PLATFORM} STREQUAL "OS") - set (IOS_PLATFORM_LOCATION "iPhoneOS.platform") - - # This causes the installers to properly locate the output libraries - set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") -elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR") - set (SIMULATOR true) - set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") - - # This causes the installers to properly locate the output libraries - set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") -elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64") - set (SIMULATOR true) - set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") - - # This causes the installers to properly locate the output libraries - set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") -else (${IOS_PLATFORM} STREQUAL "OS") - message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. 
Please choose OS or SIMULATOR") -endif (${IOS_PLATFORM} STREQUAL "OS") - -# Setup iOS developer location unless specified manually with CMAKE_IOS_DEVELOPER_ROOT -# Note Xcode 4.3 changed the installation location, choose the most recent one available -exec_program(/usr/bin/xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR) -set (XCODE_POST_43_ROOT "${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer") -set (XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") -if (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT) - if (EXISTS ${XCODE_POST_43_ROOT}) - set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT}) - elseif(EXISTS ${XCODE_PRE_43_ROOT}) - set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT}) - endif (EXISTS ${XCODE_POST_43_ROOT}) -endif (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT) -set (CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") - -set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk") -# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT -if (NOT DEFINED CMAKE_IOS_SDK_ROOT) - file (GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*") - if (_CMAKE_IOS_SDKS) - list (SORT _CMAKE_IOS_SDKS) - list (REVERSE _CMAKE_IOS_SDKS) - list (GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT) - else (_CMAKE_IOS_SDKS) - message (FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. 
Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.") - endif (_CMAKE_IOS_SDKS) - message (STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}") -endif (NOT DEFINED CMAKE_IOS_SDK_ROOT) -set (CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") - -# Set the sysroot default to the most recent SDK -set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") - -# set the architecture for iOS -if (${IOS_PLATFORM} STREQUAL "OS") -elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR") - set (IOS_ARCH i386) -elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64") - set (IOS_ARCH x86_64) -endif (${IOS_PLATFORM} STREQUAL "OS") - -set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") - -# Set the find root to the iOS developer roots and to user defined paths -set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root") - -# default to searching for frameworks first -set (CMAKE_FIND_FRAMEWORK FIRST) - -# set up the default search directories for frameworks -set (CMAKE_SYSTEM_FRAMEWORK_PATH - ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks - ${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks - ${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks - ) - -# only search the iOS sdks, not the remainder of the host filesystem -set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) -set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - - -# This little macro lets you set any XCode specific property -macro (set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE) - set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}) -endmacro (set_xcode_property) - - -# This macro lets you find executable programs on the host system -macro (find_host_package) - set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set 
(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set (IOS FALSE) - - find_package(${ARGN}) - - set (IOS TRUE) - set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) - set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) - set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endmacro (find_host_package) - diff --git a/mobile/tools/net-detail.awk b/mobile/tools/net-detail.awk deleted file mode 100644 index 84d0166ac7..0000000000 --- a/mobile/tools/net-detail.awk +++ /dev/null @@ -1,91 +0,0 @@ -BEGIN { -print "digraph G {" -} -/op:/ { - id++ - opname[id] = $NF -} -/input/ { - type = "input" - para = $NF - if (input[id]) { - input[id] = input[id] "|" - } - input[id] = input[id] "<" para ">" para -} -/output/ { - type = "output" - para = $NF - if (output[id]) { - output[id] = output[id] "|" - } - output[id] = output[id] "<" para ">" para -} -/attr/ { - type = "attr" - aname = $NF - if (attr_key[id]) { - attr_key[id] = attr_key[id] "|" - attr_value[id] = attr_value[id] "|" - } - attr_key[id] = attr_key[id] $NF -} -/argument/ { - if (type == "attr") { - split($0, arr, " - ") - attr_value[id] = attr_value[id] arr[2] - } else if ((type == "input") || (type == "output")) { - if (!var2id[$NF]) { - var_id++ - var[var_id] = $NF - var2id[$NF] = var_id - } - varid = var2id[$NF] - lid++ - if (type == "input") { - line[lid] = "var_" varid " -> " "op_" id ":<" para ">" - if (xout[$NF]) { - xi++ - xline[xi] = "xop_" xout[$NF] " -> " "xop_" id - } - } else if (type == "output") { - line[lid] = "op_" id ":<" para ">" " -> " "var_" varid - xout[$NF] = id - } - } -} -/var name/ { - varname = $NF - vid = var2id[varname] -} -/var tensor desc dim / { - if (tensor[vid]) tensor[vid] = tensor[vid] " x " - tensor[vid] = tensor[vid] $NF -} -END { - -print "subgraph cluster_G0 {" -for (i = 1; i <= id; i++) { - print "xop_" i "[label=\"" i ". " opname[i] "\"]" -} -for (i = 1; i <= xi; i++) { - print xline[i] -} -print "}" - -for (i = 1; i <= id; i++) { -print "op_" i "[group=op;shape=record;label=\"{{" input[i] "}|" i ". 
" opname[i] "|{" output[i] "}}\"]" -} -for (i = 1; i <= var_id; i++) { -print "var_" i "[label=\"" var[i] " [" tensor[i] "]\"]" -} -for (i = 1; i <= lid; i++) { -print line[i] -} -for (i = 1; i <= id; i++) { -print "attr_" i "[shape=record;label=\"{" attr_key[i] "}|{" attr_value[i] "}\"]" -print "attr_" i " -> " "op_" i ":" -} -print "}" -} - diff --git a/mobile/tools/net.awk b/mobile/tools/net.awk deleted file mode 100644 index 25689c90d8..0000000000 --- a/mobile/tools/net.awk +++ /dev/null @@ -1,27 +0,0 @@ -BEGIN { - print "digraph {" -} -/op:/ { - id++ - op = $NF - opname = op "_" id - print opname "[\"label\"=\"" op " [" id "]" "\"]" -} -/input/ { - type = "input" -} -/output/ { - type = "output" -} -/argument/ { - if (type == "output") { - output[$NF] = opname - } else if (type == "input") { - if (output[$NF]) { - print output[$NF] " -> " opname - } - } -} -END { - print "}" -} diff --git a/mobile/tools/op.cmake b/mobile/tools/op.cmake deleted file mode 100755 index 44f2bc0f08..0000000000 --- a/mobile/tools/op.cmake +++ /dev/null @@ -1,770 +0,0 @@ -set(FOUND_MATCH OFF) -set(CON -1) - -message(STATUS "nets :${NET}") - -list(FIND NET "googlenet" CON) -if (CON GREATER -1) - message("googlenet enabled") - set(CONCAT_OP ON) - set(CONV_OP ON) - set(LRN_OP ON) - set(MUL_OP ON) - set(ELEMENTWISEADD_OP ON) - set(FUSION_FC_OP ON) - set(POOL_OP ON) - set(RELU_OP ON) - set(FUSION_CONVADD_OP ON) - set(FUSION_CONVADDRELU_OP ON) - - set(FOUND_MATCH ON) -endif() - -list(FIND NET "mobilenet" CON) -if (CON GREATER -1) - message("mobilenet enabled") - set(CONV_OP ON) - set(ELEMENTWISEADD_OP ON) - set(RELU_OP ON) - set(SOFTMAX_OP ON) - set(MUL_OP ON) - set(DEPTHWISECONV_OP ON) - set(BATCHNORM_OP ON) - set(POOL_OP ON) - set(RESHAPE_OP ON) - set(FUSION_CONVADDBNRELU_OP ON) - set(FUSION_CONVADDRELU_OP ON) - set(FUSION_CONVADD_OP ON) - - set(FOUND_MATCH ON) -endif() - - -list(FIND NET "mobilenetssd" CON) -if (CON GREATER -1) - message("mobilenetssd enabled") - 
set(FUSION_CONVBNRELU_OP ON) - set(FUSION_CONVBNRELU_OP ON) - set(FUSION_DWCONVBNRELU_OP ON) - set(FUSION_CONVADD_OP ON) - set(MULTICLASSNMS_OP ON) - set(SOFTMAX_OP ON) - set(TRANSPOSE_OP ON) - #feed - set(PRIORBOX_OP ON) - set(CONCAT_OP ON) - set(BOXCODER_OP ON) - set(RESHAPE_OP ON) -#fetch - #total - - set(FOUND_MATCH ON) - -endif() - - -list(FIND NET "yolo" CON) -if (CON GREATER -1) - message("yolo enabled") - set(BATCHNORM_OP ON) - set(CONV_OP ON) - set(RELU_OP ON) - set(ELEMENTWISEADD_OP ON) - - set(FOUND_MATCH ON) -endif() - -list(FIND NET "squeezenet" CON) -if (CON GREATER -1) - message("squeezenet enabled") - set(CONCAT_OP ON) - set(CONV_OP ON) - set(RELU_OP ON) - set(ELEMENTWISEADD_OP ON) - set(POOL_OP ON) - set(RESHAPE_OP ON) - set(SOFTMAX_OP ON) - - set(FOUND_MATCH ON) -endif() - - -list(FIND NET "resnet" CON) -if (CON GREATER -1) - message("resnet enabled") - set(CONCAT_OP ON) - set(CONV_OP ON) - set(RELU_OP ON) - set(ELEMENTWISEADD_OP ON) - set(POOL_OP ON) - set(BATCHNORM_OP ON) - set(FUSION_CONVBNADDRELU_OP ON) - set(MUL_OP ON) - set(RESHAPE_OP ON) - set(SOFTMAX_OP ON) - set(FOUND_MATCH ON) -endif() - -list(FIND NET "FPGA_NET_V1" CON) -if (CON GREATER -1) - message("FPGA_NET_V1 enabled") - set(FUSION_CONVADDRELU_OP ON) - set(FUSION_ELEMENTWISEADDRELU_OP ON) - set(FUSION_FC_OP ON) - set(POOL_OP ON) - set(SOFTMAX_OP ON) - set(FUSION_CONVBNRELU_OP ON) - set(FUSION_CONVBN_OP ON) - set(TANH_OP ON) - set(ELEMENTWISEADD_OP ON) - set(TRANSPOSE2_OP ON) - set(FUSION_CONVADD_OP ON) - set(SPLIT_OP ON) - set(FUSION_DECONVADD_OP ON) - set(FUSION_DECONVADDRELU_OP ON) - - set(RESHAPE_OP ON) - set(FUSION_CONVADDBNRELU_OP ON) - set(FUSION_CONVADDBN_OP ON) - set(RESHAPE2_OP ON) - set(PSROI_POOL_OP ON) - set(ROIALIGN_POOL_OP ON) - set(PROPOSAL_OP ON) - set(ANCHOR_GENERATOR_OP ON) - set(SLICE_OP ON) - set(SIGMOID_OP ON) - set(CONCAT_OP ON) - set(PAD2D_OP ON) - set(CONV_TRANSPOSE_OP ON) - set(FUSION_DECONVADDBNRELU_OP ON) - set(FUSION_DECONVADDBN_OP ON) - 
set(FUSION_DECONVBNRELU_OP ON) - set(CONV_OP ON) - set(ELEMENTWISEMUL_OP ON) - set(FUSION_FCRELU_OP ON) - set(RELU_OP ON) - set(FOUND_MATCH ON) -endif() - -list(FIND NET "FPGA_NET_V2" CON) -if (CON GREATER -1) - message("FPGA_NET_V2 enabled") - set(FUSION_CONVADDRELU_OP ON) - set(FUSION_ELEMENTWISEADDRELU_OP ON) - set(FUSION_FC_OP ON) - set(POOL_OP ON) - set(SOFTMAX_OP ON) - set(FUSION_CONVBNRELU_OP ON) - set(FUSION_CONVBN_OP ON) - set(TANH_OP ON) - set(ELEMENTWISEADD_OP ON) - set(TRANSPOSE2_OP ON) - set(FUSION_CONVADD_OP ON) - set(SPLIT_OP ON) - set(FUSION_DECONVADD_OP ON) - set(FUSION_DECONVADDRELU_OP ON) - - set(RESHAPE_OP ON) - set(FUSION_CONVADDBNRELU_OP ON) - set(FUSION_CONVADDBN_OP ON) - set(RESHAPE2_OP ON) - set(PSROI_POOL_OP ON) - set(ROIALIGN_POOL_OP ON) - set(PROPOSAL_OP ON) - set(ANCHOR_GENERATOR_OP ON) - set(SLICE_OP ON) - set(SIGMOID_OP ON) - set(CONCAT_OP ON) - set(CONV_TRANSPOSE_OP ON) - set(FUSION_DECONVADDBNRELU_OP ON) - set(FUSION_DECONVADDBN_OP ON) - set(FUSION_DECONVBNRELU_OP ON) - set(CONV_OP ON) - set(ELEMENTWISEMUL_OP ON) - set(FUSION_FCRELU_OP ON) - set(RELU_OP ON) - set(FOUND_MATCH ON) -endif() - -list(FIND NET "FPGA_OPS_KD" CON) -if (CON GREATER -1) - message("FPGA_OPS_KD enabled") - set(CONV_OP ON) - set(FUSION_CONVADDRELU_OP ON) - set(FUSION_ELEMENTWISEADDRELU_OP ON) - set(FUSION_FC_OP ON) - set(POOL_OP ON) - set(SOFTMAX_OP ON) - set(FUSION_CONVBNRELU_OP ON) - set(FUSION_CONVBN_OP ON) - set(TANH_OP ON) - set(ELEMENTWISEADD_OP ON) - set(TRANSPOSE2_OP ON) - set(FUSION_CONVADD_OP ON) - set(SPLIT_OP ON) - set(FUSION_DECONVADD_OP ON) - set(FUSION_DECONVADDRELU_OP ON) - set(FOUND_MATCH ON) -endif() - -list(FIND NET "nlp" CON) -if (CON GREATER -1) - message("nlp enabled") - set(FUSION_FC_OP ON) - set(LOOKUP_OP ON) - set(GRU_OP ON) - set(CRF_OP ON) - set(CONCAT_OP ON) - set(ELEMENTWISEADD_OP ON) - - - set(FOUND_MATCH ON) -endif() - -list(FIND NET "mobilenetfssd" CON) -if (CON GREATER -1) - message("mobilenetfssd enabled") - 
set(FUSION_CONVADDRELU_OP ON) - set(FUSION_CONVADDBNRELU_OP ON) - set(FUSION_CONVADD_OP ON) - set(SOFTMAX_OP ON) - set(RESHAPE_OP ON) - set(BILINEAR_INTERP_OP ON) - set(TRANSPOSE_OP ON) - set(CONCAT_OP ON) - set(PRIORBOX_OP ON) - set(BATCHNORM_OP ON) - set(BOXCODER_OP ON) - set(MULTICLASSNMS_OP ON) - set(FLATTEN_OP ON) - set(FLATTEN2_OP ON) - set(SPLIT_OP ON) - set(SHAPE_OP ON) - - set(FOUND_MATCH ON) -endif() - -list(FIND NET "genet" CON) -if (CON GREATER -1) - message("genet enabled") - set(FUSION_CONVADDPRELU_OP ON) - set(FUSION_CONVADDADDPRELU_OP ON) - set(FUSION_CONVADD_OP ON) - set(CONV_TRANSPOSE_OP ON) - set(FUSION_CONVADDRELU_OP ON) - set(ELEMENTWISEADD_OP ON) - set(PRELU_OP ON) - set(POOL_OP ON) - set(CONCAT_OP ON) - - set(FOUND_MATCH ON) -endif() - -list(FIND NET "super" CON) -if (CON GREATER -1) - message("super enabled") - set(FUSION_CONVADD_OP ON) - set(FUSION_CONVADDRELU_OP ON) - set(ELEMENTWISEADD_OP ON) - - set(FOUND_MATCH ON) -endif() - -list(FIND NET "op" CON) -if (CON GREATER -1) - message("op enabled") -# set(SIGMOID_OP ON) -# set(LEAKY_RELU_OP ON) - set(BLOG ON) - set(FOUND_MATCH ON) -endif() - -if(NOT FOUND_MATCH) - message("--default--") - set(NORM_OP ON) - set(BATCHNORM_OP ON) - set(INSTANCENORM_OP ON) - set(CONV_TRANSPOSE_OP ON) - set(BOXCODER_OP ON) - set(CONCAT_OP ON) - set(CONV_OP ON) - set(DEPTHWISECONV_OP ON) - set(ELEMENTWISEADD_OP ON) - set(ELEMENTWISESUB_OP ON) - set(IM2SEQUENCE_OP ON) - set(FILL_CONSTANT_OP ON) - set(DENSITY_PRIORBOX_OP ON) - set(FUSION_CONVADD_OP ON) - set(FUSION_CONVADDPRELU_OP ON) - set(EXP_OP ON) - set(FUSION_CONVADDRELU_OP ON) - set(FUSION_FC_OP ON) - set(LRN_OP ON) - set(MUL_OP ON) - set(MULTICLASSNMS_OP ON) - set(POLYGONBOXTRANSFORM_OP ON) - set(POOL_OP ON) - set(PRIORBOX_OP ON) - set(RELU_OP ON) - set(RESHAPE_OP ON) - set(RESHAPE2_OP ON) - set(SIGMOID_OP ON) - set(SOFTMAX_OP ON) - set(TRANSPOSE_OP ON) - set(TRANSPOSE2_OP ON) - set(FUSION_CONVADDBNRELU_OP ON) - set(FUSION_CONVADDADDPRELU_OP ON) - 
set(FUSION_DWCONVBNRELU_OP ON) - set(FUSION_CONVBNRELU_OP ON) - set(FUSION_CONVRELU_OP ON) - set(FUSION_CONVBNADDRELU_OP ON) - set(PRELU_OP ON) - set(RESIZE_OP ON) - set(SCALE_OP ON) - set(SLICE_OP ON) - set(DROPOUT_OP ON) - set(IM2SEQUENCE_OP ON) - set(LOOKUP_OP ON) - set(GRU_OP ON) - set(GRU_UNIT_OP ON) - set(CRF_OP ON) - set(BILINEAR_INTERP_OP ON) - set(SPLIT_OP ON) - set(FLATTEN_OP ON) - set(FLATTEN2_OP ON) - set(SHAPE_OP ON) - set(ELEMENTWISEMUL_OP ON) - set(SUM_OP ON) - set(TOP_K_OP ON) - set(CAST_OP ON) - set(QUANT_OP ON) - set(DEQUANT_OP ON) - set(FUSION_DEQUANT_BN_OP ON) - set(FUSION_DEQUANT_ADD_BN_OP ON) - set(FUSION_DEQUANT_BN_RELU_OP ON) - set(FUSION_DEQUANT_ADD_BN_RELU_OP ON) - set(FUSION_DEQUANT_ADD_BN_QUANT_OP ON) - set(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP ON) - set(SEQUENCE_EXPAND_OP ON) - set(SEQUENCE_POOL_OP ON) - set(SEQUENCE_SOFTMAX_OP ON) - set(LOG_OP ON) - set(TANH_OP ON) - set(LOD_RESET_OP ON) - set(LESS_THAN_OP ON) - set(LOGICAL_AND_OP ON) - set(LOGICAL_OR_OP ON) - set(LOGICAL_NOT_OP ON) - set(LOGICAL_XOR_OP ON) - set(WHILE_OP ON) - set(WRITE_TO_ARRAY_OP ON) - set(READ_FROM_ARRAY_OP ON) - set(IS_EMPTY_OP ON) - set(INCREMENT_OP ON) - set(ANCHOR_GENERATOR_OP ON) - set(PROPOSAL_OP ON) - set(PSROI_POOL_OP ON) - set(ROI_PERSPECTIVE_OP ON) - set(BEAM_SEARCH_OP ON) - set(BEAM_SEARCH_DECODE_OP ON) - set(PAD2D_OP ON) - set(ONE_HOT_OP ON) - set(ASSIGN_VALUE_OP ON) - set(NEAREST_INTERP_OP ON) - set(LEAKY_RELU_OP ON) - set(ASSIGN_OP ON) - set(CONDITIONAL_BLOCK_OP ON) - set(EQUAL_OP ON) - set(FILL_CONSTANT_BATCH_SIZE_LIKE_OP ON) - set(RANGE_OP ON) - set(REDUCE_PROD_OP ON) - set(FUSION_INSTANCENORM_RELU_OP ON) - set(PIXEL_SHUFFLE_OP ON) - set(EXPAND_OP ON) - set(GRID_SAMPLER_OP ON) -endif() - - # option(BATCHNORM_OP "" ON) - # option(BOXCODER_OP "" ON) - # option(CONCAT_OP "" ON) - # option(CONV_OP "" ON) - # option(DEPTHWISECONV_OP "" ON) - # option(ELEMENTWISEADD_OP "" ON) - # option(FILL_CONSTANT_OP "" ON) - # option(FUSION_CONVADD_OP "" ON) - # 
option(FUSION_CONVADDRELU_OP "" ON) - # option(FUSION_FC_OP "" ON) - # option(LRN_OP "" ON) - # option(MUL_OP "" ON) - # option(MULTICLASSNMS_OP "" ON) - # option(POLYGONBOXTRANSFORM_OP "" ON) - # option(POOL_OP "" ON) - # option(PRIORBOX_OP "" ON) - # option(RELU_OP "" ON) - # option(RESHAPE_OP "" ON) - # option(RESHAPE2_OP "" ON) - # option(SIGMOID_OP "" ON) - # option(SOFTMAX_OP "" ON) - # option(TRANSPOSE_OP "" ON) - # option(TRANSPOSE2_OP "" ON) -# endif () - -if (NORM_OP) - add_definitions(-DNORM_OP) -endif() -if (BATCHNORM_OP) - add_definitions(-DBATCHNORM_OP) -endif() -if (INSTANCENORM_OP) - add_definitions(-DINSTANCENORM_OP) -endif() -if (FUSION_INSTANCENORM_RELU_OP) - add_definitions(-DFUSION_INSTANCENORM_RELU_OP) -endif() -if (BOXCODER_OP) - add_definitions(-DBOXCODER_OP) -endif() -if (CONCAT_OP) - add_definitions(-DCONCAT_OP) -endif() -if (CONV_OP) - add_definitions(-DCONV_OP) -endif() -if (DEPTHWISECONV_OP) - add_definitions(-DDEPTHWISECONV_OP) -endif() -if (ELEMENTWISEADD_OP) - add_definitions(-DELEMENTWISEADD_OP) -endif() -if (ELEMENTWISESUB_OP) - add_definitions(-DELEMENTWISESUB_OP) -endif() -if (FILL_CONSTANT_OP) - add_definitions(-DFILL_CONSTANT_OP) -endif() -# if (FUSION_CONVADD_OP) -# add_definitions(-DFUSION_CONVADD_OP) -# endif() -if (FUSION_CONVADDRELU_OP) - add_definitions(-DFUSION_CONVADDRELU_OP) -endif() -if (FUSION_CONVADDPRELU_OP) - add_definitions(-DFUSION_CONVADDPRELU_OP) -endif() -if (FUSION_CONVADDADDPRELU_OP) - add_definitions(-DFUSION_CONVADDADDPRELU_OP) -endif() -if (FUSION_FC_OP) - add_definitions(-DFUSION_FC_OP) -endif() -if (LRN_OP) - add_definitions(-DLRN_OP) -endif() -if (MUL_OP) - add_definitions(-DMUL_OP) -endif() -if (MULTICLASSNMS_OP) - add_definitions(-DMULTICLASSNMS_OP) -endif() -if (POLYGONBOXTRANSFORM_OP) - add_definitions(-DPOLYGONBOXTRANSFORM_OP) -endif() -if (POOL_OP) - add_definitions(-DPOOL_OP) -endif() -if (PRIORBOX_OP) - add_definitions(-DPRIORBOX_OP) -endif() -if (RELU_OP) - add_definitions(-DRELU_OP) -endif() 
-if (RESHAPE_OP) - add_definitions(-DRESHAPE_OP) -endif() -if (RESHAPE2_OP) - add_definitions(-DRESHAPE2_OP) -endif() -if (SIGMOID_OP) - add_definitions(-DSIGMOID_OP) -endif() -if (SOFTMAX_OP) - add_definitions(-DSOFTMAX_OP) -endif() -if (TRANSPOSE_OP) - add_definitions(-DTRANSPOSE_OP) -endif() -if (TRANSPOSE2_OP) - add_definitions(-DTRANSPOSE2_OP) -endif() -if (FUSION_CONVADDBNRELU_OP) - add_definitions(-DFUSION_CONVADDBNRELU_OP) -endif() -if (FUSION_DWCONVBNRELU_OP) - add_definitions(-DFUSION_DWCONVBNRELU_OP) -endif() - -if (FUSION_CONVBNRELU_OP) - add_definitions(-DFUSION_CONVBNRELU_OP) -endif() - -if (FUSION_CONVRELU_OP) - add_definitions(-DFUSION_CONVRELU_OP) -endif() - -if (FUSION_CONVBNADDRELU_OP) - add_definitions(-DFUSION_CONVBNADDRELU_OP) -endif() - -if (PRELU_OP) - add_definitions(-DPRELU_OP) -endif() -if (RESIZE_OP) - add_definitions(-DRESIZE_OP) -endif() -if (SCALE_OP) - add_definitions(-DSCALE_OP) -endif() -if (SLICE_OP) - add_definitions(-DSLICE_OP) -endif() -if (DROPOUT_OP) - add_definitions(-DDROPOUT_OP) -endif() -if (IM2SEQUENCE_OP) - add_definitions(-DIM2SEQUENCE_OP) -endif() - -if (FUSION_CONVADDBN_OP) - add_definitions(-DFUSION_CONVADDBN_OP) -endif() -if (FUSION_FCRELU_OP) - add_definitions(-DFUSION_FCRELU_OP) -endif() -if (FUSION_POOLBN_OP) - add_definitions(-DFUSION_POOLBN_OP) -endif() -if (FUSION_ELEMENTWISEADDRELU_OP) - add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP) -endif() -if (FUSION_CONVBN_OP) - add_definitions(-DFUSION_CONVBN_OP) -endif() - -if (CONV_TRANSPOSE_OP) - add_definitions(-DCONV_TRANSPOSE_OP) -endif() - -if (LOOKUP_OP) - add_definitions(-DLOOKUP_OP) -endif() - -if (GRU_OP) - add_definitions(-DGRU_OP) -endif() - -if (GRU_UNIT_OP) - add_definitions(-DGRU_UNIT_OP) -endif() - -if (CRF_OP) - add_definitions(-DCRF_OP) -endif() - - -if (FLATTEN_OP) - add_definitions(-DFLATTEN_OP) -endif() - -if (FLATTEN2_OP) - add_definitions(-DFLATTEN2_OP) -endif() - -if (SPLIT_OP) - add_definitions(-DSPLIT_OP) -endif() - -if (BILINEAR_INTERP_OP) 
- add_definitions(-DBILINEAR_INTERP_OP) -endif() - -if (SHAPE_OP) - add_definitions(-DSHAPE_OP) -endif() - -if (ELEMENTWISEMUL_OP) - add_definitions(-DELEMENTWISEMUL_OP) -endif() -if (SUM_OP) - add_definitions(-DSUM_OP) -endif() -if (TOP_K_OP) - add_definitions(-DTOP_K_OP) -endif() -if (CAST_OP) - add_definitions(-DCAST_OP) -endif() -if (QUANT_OP) - add_definitions(-DQUANT_OP) -endif() -if (DEQUANT_OP) - add_definitions(-DDEQUANT_OP) -endif() -if (FUSION_DEQUANT_BN_OP) - add_definitions(-DFUSION_DEQUANT_BN_OP) -endif() -if (FUSION_DEQUANT_ADD_BN_OP) - add_definitions(-DFUSION_DEQUANT_ADD_BN_OP) -endif() -if (FUSION_DEQUANT_BN_RELU_OP) - add_definitions(-DFUSION_DEQUANT_BN_RELU_OP) -endif() -if (FUSION_DEQUANT_ADD_BN_RELU_OP) - add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_OP) -endif() -if (FUSION_DEQUANT_ADD_BN_QUANT_OP) -# add_definitions(-DFUSION_DEQUANT_ADD_BN_QUANT_OP) -endif() -if (FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) -# add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) -endif() -if (SEQUENCE_EXPAND_OP) - add_definitions(-DSEQUENCE_EXPAND_OP) -endif() -if (SEQUENCE_POOL_OP) - add_definitions(-DSEQUENCE_POOL_OP) -endif() -if (SEQUENCE_SOFTMAX_OP) - add_definitions(-DSEQUENCE_SOFTMAX_OP) -endif() -if (LOG_OP) - add_definitions(-DLOG_OP) -endif() -if (LOD_RESET_OP) - add_definitions(-DLOD_RESET_OP) -endif() -if (LESS_THAN_OP) - add_definitions(-DLESS_THAN_OP) -endif() -if (LOGICAL_AND_OP) - add_definitions(-DLOGICAL_AND_OP) -endif() -if (LOGICAL_OR_OP) - add_definitions(-DLOGICAL_OR_OP) -endif() -if (LOGICAL_NOT_OP) - add_definitions(-DLOGICAL_NOT_OP) -endif() -if (LOGICAL_XOR_OP) - add_definitions(-DLOGICAL_XOR_OP) -endif() - -if (TANH_OP) - add_definitions(-DTANH_OP) -endif() -if (FUSION_DECONVRELU_OP) - add_definitions(-DFUSION_DECONVRELU_OP) -endif() -if (FUSION_DECONVADD_OP) - add_definitions(-DFUSION_DECONVADD_OP) -endif() -if (FUSION_DECONVADDRELU_OP) - add_definitions(-DFUSION_DECONVADDRELU_OP) -endif() -if (WHILE_OP) - add_definitions(-DWHILE_OP) 
-endif() -if (WRITE_TO_ARRAY_OP) - add_definitions(-DWRITE_TO_ARRAY_OP) -endif() -if (READ_FROM_ARRAY_OP) - add_definitions(-DREAD_FROM_ARRAY_OP) -endif() -if (IS_EMPTY_OP) - add_definitions(-DIS_EMPTY_OP) -endif() -if (INCREMENT_OP) - add_definitions(-DINCREMENT_OP) -endif() - -if (ANCHOR_GENERATOR_OP) - add_definitions(-DANCHOR_GENERATOR_OP) -endif() -if (PROPOSAL_OP) - add_definitions(-DPROPOSAL_OP) -endif() -if (PSROI_POOL_OP) - add_definitions(-DPSROI_POOL_OP) -endif() -if (ROIALIGN_POOL_OP) - add_definitions(-DROIALIGN_POOL_OP) -endif() -if (ROI_PERSPECTIVE_OP) - add_definitions(-DROI_PERSPECTIVE_OP) -endif() -if (BEAM_SEARCH_OP) - add_definitions(-DBEAM_SEARCH_OP) -endif() -if (BEAM_SEARCH_DECODE_OP) - add_definitions(-DBEAM_SEARCH_DECODE_OP) -endif() -if (FUSION_DECONVADDBNRELU_OP) - add_definitions(-DFUSION_DECONVADDBNRELU_OP) -endif() -if (FUSION_DECONVBNRELU_OP) - add_definitions(-DFUSION_DECONVBNRELU_OP) -endif() -if (FUSION_DECONVADDBN_OP) - add_definitions(-DFUSION_DECONVADDBN_OP) -endif() -if (PAD2D_OP) - add_definitions(-DPAD2D_OP) -endif() -if (ONE_HOT_OP) - add_definitions(-DONE_HOT_OP) -endif() -if (ASSIGN_VALUE_OP) - add_definitions(-DASSIGN_VALUE_OP) -endif() -if (LEAKY_RELU_OP) - add_definitions(-DLEAKY_RELU_OP) -endif() -if (NEAREST_INTERP_OP) - add_definitions(-DNEAREST_INTERP_OP) -endif() -if (DENSITY_PRIORBOX_OP) - add_definitions(-DDENSITY_PRIORBOX_OP) -endif() -if (EXP_OP) - add_definitions(-DEXP_OP) -endif () -if (ASSIGN_OP) - add_definitions(-DASSIGN_OP) -endif() -if (CONDITIONAL_BLOCK_OP) - add_definitions(-DCONDITIONAL_BLOCK_OP) -endif() -if (EQUAL_OP) - add_definitions(-DEQUAL_OP) -endif() -if (FILL_CONSTANT_BATCH_SIZE_LIKE_OP) - add_definitions(-DFILL_CONSTANT_BATCH_SIZE_LIKE_OP) -endif() -if (RANGE_OP) - add_definitions(-DRANGE_OP) -endif() -if (REDUCE_PROD_OP) - add_definitions(-DREDUCE_PROD_OP) -endif() -if (PIXEL_SHUFFLE_OP) - add_definitions(-DPIXEL_SHUFFLE_OP) -endif() -if (EXPAND_OP) - add_definitions(-DEXPAND_OP) -endif() 
-if (GRID_SAMPLER_OP) - add_definitions(-DGRID_SAMPLER_OP) -endif() -if (BLOG) - add_definitions(-DBLOG) -endif() - diff --git a/mobile/tools/pre-commit.hooks/clang-format.hook b/mobile/tools/pre-commit.hooks/clang-format.hook deleted file mode 100644 index ffba8744f4..0000000000 --- a/mobile/tools/pre-commit.hooks/clang-format.hook +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# set -e - -readonly VERSION="5.0" - -version=$(clang-format -version) - -if ! [[ $version == *"$VERSION"* ]]; then - echo "clang-format version check failed." - echo "a version contains '$VERSION' is needed, but get '$version'" - echo "you can install the right version, and make an soft-link to '\$PATH' env" - exit -1 -fi - -# https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/ -shift -perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@" -( -# remove clang format ios_io folder -flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||') -clang-format -i $flist -) -perl -i -pe 's|// ||' "$@" diff --git a/mobile/tools/pre-commit.hooks/clang-tidy.hook b/mobile/tools/pre-commit.hooks/clang-tidy.hook deleted file mode 100755 index 2d7847c330..0000000000 --- a/mobile/tools/pre-commit.hooks/clang-tidy.hook +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -bash -c "cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON" - -TOTAL_ERRORS=0 - -# The trick to remove deleted files: https://stackoverflow.com/a/2413151 -for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | grep "src/" | grep -v ".pb." | grep -v ".h"); do - echo "clang-tidy check $file"; - clang-tidy $file --fix --fix-errors --header-filter=.* - TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); - echo "clang-tidy error TOTAL_ERRORS = $TOTAL_ERRORS . 
" -done - -rm -f compile_commands.json - -exit $TOTAL_ERRORS - diff --git a/mobile/tools/pre-commit.hooks/copyright.hook b/mobile/tools/pre-commit.hooks/copyright.hook deleted file mode 100644 index 8fc0028059..0000000000 --- a/mobile/tools/pre-commit.hooks/copyright.hook +++ /dev/null @@ -1,124 +0,0 @@ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import io -import platform -import re -import subprocess - -COPYRIGHT = ''' -Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-''' - -LANG_COMMENT_MARK = None - -NEW_LINE_MARK = None - -COPYRIGHT_HEADER = None - -if platform.system() == "Windows": - NEW_LINE_MARK = "\r\n" -else: - NEW_LINE_MARK = '\n' - COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1] - p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0) - process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE) - date, err = process.communicate() - date = date.decode("utf-8").rstrip("\n") - COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date) - - -def generate_copyright(template, lang='C'): - end_line = "" - if lang == 'Python': - lang_coment_mark = '# ' - start = lang_coment_mark - blank = " " - else: - lang_coment_mark = "" - start = "/* " - blank = "" - end_line = " */" - lines = template.split(NEW_LINE_MARK) - - ans = start + blank + COPYRIGHT_HEADER + NEW_LINE_MARK - - for lino, line in enumerate(lines): - if lino == 0 or lino == 1 or lino == len(lines) - 1: - continue - if lino == (len(lines) - 2): - ans += lang_coment_mark + blank + line + end_line + NEW_LINE_MARK - else: - ans += lang_coment_mark + blank + line + NEW_LINE_MARK - return ans + "\n" - - -def lang_type(filename): - if filename.endswith(".py"): - return "Python" - elif filename.endswith(".h"): - return "C" - elif filename.endswith(".c"): - return "C" - elif filename.endswith(".hpp"): - return "C" - elif filename.endswith(".cc"): - return "C" - elif filename.endswith(".cpp"): - return "C" - else: - print("Unsupported filetype %s", filename) - exit(0) - - -PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)") - - -def main(argv=None): - parser = argparse.ArgumentParser( - description='Checker for copyright declaration.') - parser.add_argument('filenames', nargs='*', help='Filenames to check') - args = parser.parse_args(argv) - - retv = 0 - for filename in args.filenames: - fd = io.open(filename, encoding="utf-8") - first_line = fd.readline() - second_line = fd.readline() - if "COPYRIGHT (C)" in first_line.upper() or "COPYRIGHT (C)" 
in second_line.upper(): - continue - if first_line.startswith("/*") or first_line.startswith("#!") or PYTHON_ENCODE.match( - second_line) is not None or PYTHON_ENCODE.match(first_line) is not None: - continue - original_contents = io.open(filename, encoding="utf-8").read() - new_contents = generate_copyright( - COPYRIGHT, lang_type(filename)) + original_contents - print('Auto Insert Copyright Header {}'.format(filename)) - retv = 1 - with io.open(filename, 'w') as output_file: - output_file.write(new_contents) - return retv - - -def test_generate_copyright(): - print(generate_copyright(COPYRIGHT)) - - -if __name__ == '__main__': - # test_generate_copyright() - exit(main()) - diff --git a/mobile/tools/pre-commit.hooks/cpplint.hook b/mobile/tools/pre-commit.hooks/cpplint.hook deleted file mode 100644 index 3740e64c73..0000000000 --- a/mobile/tools/pre-commit.hooks/cpplint.hook +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -TOTAL_ERRORS=0 - -# The trick to remove deleted files: https://stackoverflow.com/a/2413151 -for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \ - grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \ - grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "^mobile/tools/quantification"); do - cpplint $file; - TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); -done - -exit $TOTAL_ERRORS diff --git a/mobile/tools/prepare_images_and_models.sh b/mobile/tools/prepare_images_and_models.sh deleted file mode 100755 index 6f224778d9..0000000000 --- a/mobile/tools/prepare_images_and_models.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# decalre download paths of images and models -PADDLE_MOBILE_ROOT="$(pwd)/../" -IMAGES_AND_MODELS="opencl_test_src" -IMAGES_AND_MODELS_PATH="http://mms-graph.bj.bcebos.com/paddle-mobile/${IMAGES_AND_MODELS}.zip" - -# download and unzip zip-files of images and models -mkdir ${PADDLE_MOBILE_ROOT}/download/ -cd ${PADDLE_MOBILE_ROOT}/download/ -wget -c 
${IMAGES_AND_MODELS_PATH} -unzip -o ./${IMAGES_AND_MODELS}.zip - -# create models and images directories below test -mkdir ${PADDLE_MOBILE_ROOT}/test/models -mkdir ${PADDLE_MOBILE_ROOT}/test/images - -# move to test directory -cp ./${IMAGES_AND_MODELS}/input_3x224x224_banana ${PADDLE_MOBILE_ROOT}/test/images/ -cp -r ./${IMAGES_AND_MODELS}/mobilenet ${PADDLE_MOBILE_ROOT}/test/models/ diff --git a/mobile/tools/profile_show.sh b/mobile/tools/profile_show.sh deleted file mode 100644 index d4a4d84e9d..0000000000 --- a/mobile/tools/profile_show.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env sh -cat < - - - - -

-
    -EOF - -min=$(awk 'NR==1{min=$4} NR>1{if($4 < min) min=$4} END{print min}' $1) -max=$(awk 'NR==1{max=$5} NR>1{if($5 > max) max=$5} END{print max}' $1) -sort $1 -k1,1n | awk -v max="$max" -v min="$min" ' -BEGIN { - total = max - min -} -{ - opid = $1 - optype = $2 - tid = $3 - cb = $4 - ce = $5 - cl = $6 - sum += $4 - $3 - print "
  • " -} -' - -cat < -
-
-EOF
-
-echo "==================[ profile ]==================="
-cat $1 | awk '
-NR>1{
-    optype = $2
-    sum += $5 - $4
-    count[$2] += $6
-}
-END {
-for (t in count) {
-    msg = sprintf("%-16s\t%-10d\t%-.4f", t, count[t], count[t]*100 / sum);
-    print msg
-}
-}' | sort -k2,2nr
-cat $1 | awk '
-NR>1{
-    sum += $5 - $4
-}
-END {
-msg = sprintf("%-16s\t%-10d\t%-.4f", "total", sum, 100);
-print msg
-}'
-
-cat <
-
-
-
-EOF
diff --git a/mobile/tools/python/caffetools/run.py b/mobile/tools/python/caffetools/run.py
deleted file mode 100644
index 914ec83f0f..0000000000
--- a/mobile/tools/python/caffetools/run.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import caffe
-import numpy as np
-
-prototxt_path = ""
-caffemodel_path = ""
-input_path = "input.txt"
-input_name = ""
-output_name = ""
-
-shape = (1, 3, 64, 64)
-
-data = np.loadtxt(input_path).astype("float32").reshape(shape)
-
-net = caffe.Net(prototxt_path, caffemodel_path, caffe.TEST)
-
-# view inputs blob names
-print(net.inputs)
-
-# view outputs blob names
-print(net.outputs)
-
-# set input data
-net.blobs[input_name].reshape(*shape)
-net.blobs[input_name].data[...] = data
-
-# predict
-net.forward()
-
-# view output data
-print(net.blobs[output_name].data)
diff --git a/mobile/tools/python/fluidtools/.gitignore b/mobile/tools/python/fluidtools/.gitignore
deleted file mode 100644
index a8dcab2592..0000000000
--- a/mobile/tools/python/fluidtools/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-*
-!run.py
-!.gitignore
-!/model-encrypt-tool
-!test_wrap.py
-!run_multi_feed.py
diff --git a/mobile/tools/python/fluidtools/run.py b/mobile/tools/python/fluidtools/run.py
deleted file mode 100644
index 6f82e426bd..0000000000
--- a/mobile/tools/python/fluidtools/run.py
+++ /dev/null
@@ -1,675 +0,0 @@
-# -*- coding: utf-8 -*
-import os
-import sys
-import math
-import subprocess
-import numpy as np
-import paddle.fluid as fluid
-
-model_path = "model"
-checked_model_path = "checked_model"
-feed_path = "feeds"
-output_path = "outputs"
-diff_threshold = 0.1
-is_lod = False
-mobile_model_path = ""
-fast_check = False
-is_sample_step = False
-sample_step = 1
-sample_num = 20
-need_encrypt = False
-checked_encrypt_model_path = "checked_encrypt_model"
-output_var_filter = []
-output_key_filter = {}
-check_shape = False
-quantification = False
-quantification_fold = 1000
-architecture = "arm-v7a"
-# architecture = "arm-v8a"
-correct_persistable = False
-
-np.set_printoptions(linewidth=150)
-
-mobile_exec_root = "/data/local/tmp/bin"
-mobile_src_root = os.path.abspath("../../../")
-if mobile_src_root.endswith("/"):
-    mobile_src_root = mobile_src_root[:-1]
-
-dot = "•"
-black = lambda x: "\033[30m" + str(x) + "\033[0m"
-red = lambda x: "\033[31m" + str(x) + "\033[0m"
-green = lambda x: "\033[32m" + str(x) + "\033[0m"
-yellow = lambda x: "\033[33m" + str(x) + "\033[0m"
-reset = lambda x: "\033[0m" + str(x)
-
-def pp_tab(x, level=0):
-    header = ""
-    for i in range(0, level):
-        header += "\t"
-    print(header + str(x))
-def pp_black(x, level=0):
-    pp_tab(black(x) + reset(""), level)
-def pp_red(x, level=0):
-    pp_tab(red(x) + reset(""), level)
-def pp_green(x, level=0):
-    pp_tab(green(x) + reset(""), level)
-def pp_yellow(x, level=0):
-    pp_tab(yellow(x) + reset(""), level)
-
-def sh(command):
-    pipe = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    return pipe.stdout.read().decode("utf-8")
-def push(src, dest=""):
-    sh("adb push {} {}".format(src, mobile_exec_root + "/" + dest))
-
-pp_yellow(dot + " start inspecting fluid model")
-
-exe = fluid.Executor(fluid.CPUPlace())
-exe.run(fluid.default_startup_program())
-
-# 加载模型
-def load_model(model_path):
-    prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model", params_filename="params")
-    global correct_persistable
-    if correct_persistable:
-        ops = prog.current_block().ops
-        vars = prog.current_block().vars
-        for op in ops:
-            for var_name in op.output_arg_names:
-                if var_name == "fetch":
-                    continue
-                var = vars[var_name]
-                if var.persistable:
-                    pp_red("has found non-persistable output var : {}".format(var_name))
-                    var.persistable = False
-    return (prog, feeds, fetches)
-
-prog, feeds, fetches = load_model(model_path)
-
-# 强制要求所有张量的形状,在model和params中一致,并重新保存模型
-def resave_model(feed_kv):
-    if len(mobile_model_path) > 0:
-        pp_green("has set mobile_model_path, stop checking model & params", 1)
-        sh("cp {}/* {}".format(mobile_model_path, checked_model_path))
-        return
-    ops = prog.current_block().ops
-    vars = prog.current_block().vars
-    # 强制所有var为可持久化
-    p_names = []
-    for name in vars:
-        name = str(name)
-        v = fluid.framework._get_var(name, prog)
-        if not v.persistable:
-            v.persistable = True
-            p_names.append(name)
-    outputs = run_model(feed_kv=feed_kv)
-    has_found_wrong_shape = False
-    # 修正每个var的形状
-    for name in vars:
-        name = str(name)
-        v = vars[name]
-        if v.persistable:
-            v1 = fluid.global_scope().find_var(name)
-            try:
-                t1 = v1.get_tensor()
-                shape = t1.shape()
-            except:
-                continue
-            if v.desc.shape() != shape:
-                has_found_wrong_shape = True
-            v.desc.set_shape(shape)
-    # 恢复var的可持久化属性
-    for name in p_names:
-        v = fluid.framework._get_var(name, prog)
-        v.persistable = False
-    if not quantification:
-        fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
-    if has_found_wrong_shape:
-        pp_red("has found wrong shape", 1)
-    else:
-        pp_green("has not found wrong shape", 1)
-    pp_green("new model is saved into directory 【{}】".format(checked_model_path), 1)
-
-# 分别加密model和params,加密key使用同一个
-def encrypt_model():
-    if not need_encrypt:
-        return
-    pp_yellow(dot + dot + " encrypting model")
-    if not os.path.exists(checked_encrypt_model_path):
-        os.mkdir(checked_encrypt_model_path)
-    res = sh("model-encrypt-tool/enc_key_gen -l 20 -c 232")
-    lines = res.split("\n")
-
-    for line in lines:
-        if line.startswith("key:"):
-            line = line.replace('key:','')
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/model -o "
-               "checked_model/model.ml".format(line))
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/params  -o checked_model/params.ml".format(line))
-            pp_green("model has been encrypted, key is : {}".format(line), 1)
-            sh("mv {} {}".format(checked_model_path + "/*.ml", checked_encrypt_model_path))
-            return
-    pp_red("model encrypt error", 1)
-
-# 生成feed的key-value对
-def gen_feed_kv():
-    feed_kv = {}
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        data = np.random.random(feed_shape).astype("float32")
-        feed_kv[feed_name] = data
-    return feed_kv
-
-# 保存feed的key-value对
-def save_feed_kv(feed_kv):
-    for feed_name in feed_kv:
-        feed_data = feed_kv[feed_name]
-        feed_list = feed_data.flatten().tolist()
-        if not os.path.exists(feed_path):
-            os.mkdir(feed_path)
-        file_name = feed_name.replace("/", "_")
-        out_file = open(feed_path + "/" + file_name, "w")
-        for feed_item in feed_list:
-            out_file.write("{}\n".format(feed_item))
-        out_file.close()
-
-last_feed_var_name = None
-last_feed_file_name = None
-last_feed_var_lod = None
-# 加载feed的key-value对
-def load_feed_kv():
-    if not os.path.exists(feed_path):
-        return None
-    global last_feed_var_name
-    global last_feed_file_name
-    global last_feed_var_lod
-    feed_kv = {}
-    pp_yellow(dot + dot + " checking feed info")
-    pp_green("feed data is saved into directory 【{}】".format(feed_path), 1)
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        pp_tab("feed var name : {}; feed var shape : {}".format(feed_name, feed_shape), 1)
-        file_name = feed_name.replace("/", "_")
-        last_feed_var_name = feed_name
-        last_feed_file_name = file_name
-        feed_file_path = feed_path + "/" + file_name
-        if not os.path.exists(feed_file_path):
-            return None
-        data = np.loadtxt(feed_file_path)
-        expected_len = 1
-        for dim in feed_shape:
-            expected_len *= dim
-        if len(np.atleast_1d(data)) != expected_len:
-            return None
-        data = data.reshape(feed_shape).astype("float32")
-        
-        if is_lod:
-            data_shape = [1]
-            for dim in feed_shape:
-                data_shape.append(dim)
-            data = data.reshape(data_shape).astype("float32")
-            tensor = fluid.LoDTensor()
-            seq_lens = [len(seq) for seq in data]
-            cur_len = 0
-            lod = [cur_len]
-            for l in seq_lens:
-                cur_len += l
-                lod.append(cur_len)
-            data = data.reshape(feed_shape)
-            tensor.set(data, fluid.CPUPlace())
-            tensor.set_lod([lod])
-            last_feed_var_lod = lod
-            feed_kv[feed_name] = tensor
-        else:
-            feed_kv[feed_name] = data
-    return feed_kv
-
-# 运行模型
-def run_model(feed_kv=None):
-    if feed_kv is None:
-        feed_kv = gen_feed_kv()
-    outputs = exe.run(prog, feed=feed_kv, fetch_list=fetches, return_numpy=False)
-    results = []
-    for output in outputs:
-        results.append(np.array(output))
-    return results
-
-# 获取变量形状
-def get_var_shape(var_name):
-    vars = prog.current_block().vars
-    shape = vars[var_name].desc.shape()
-    for i in range(len(shape)):
-        dim = shape[i]
-        if dim == -1:
-            shape[i] = 1
-    return shape
-
-# 获取输入变量形状
-def get_feed_var_shape(var_name):
-    # 如果想写死输入形状,放开以下语句
-    # return [1, 3, 224, 224]
-    return get_var_shape(var_name)
-
-persistable_cache = []
-# 所有var,全部变成持久化
-def force_all_vars_to_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if not persistable:
-            persistable_cache.append(var_name)
-            v.persistable = True
-
-# 恢复持久化属性
-def restore_all_vars_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if var_name in persistable_cache:
-            v.persistable = False
-    persistable_cache = []
-
-# 获取var的数据
-def get_var_data(var_name, feed_kv=None):
-    output = np.array(fluid.global_scope().var(var_name).get_tensor())
-    return output
-
-output_var_cache = {}
-def tensor_sample(tensor):
-    if is_sample_step:
-        step = sample_step
-    else:
-        step = math.floor(len(tensor) / sample_num)
-    step = max(step, 1)
-    step = int(step)
-    sample = []
-    for i in range(0, len(tensor), step):
-        sample.append(tensor[i])
-    return sample
-
-op_cache = {}
-# 获取每层输出的数据
-def save_all_op_output(feed_kv=None):
-    force_all_vars_to_persistable()
-    outputs = run_model(feed_kv=feed_kv)
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    ops = prog.current_block().ops
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    feed_names = feeds
-    if len(output_var_filter) > 0:
-        for fetch_name in fetch_names:
-            output_var_filter.append(fetch_name)
-    for i in range(len(ops)):
-        op = ops[i]
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in ["Y", "Out", "Output"]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            for name in op.output_arg_names:
-                var_name = name
-                if "tmp" in name:
-                    break
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    for i in range(len(ops)):
-        op = ops[i]
-        if op.type not in output_key_filter:
-            continue
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in output_key_filter[op.type]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            continue
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    pp_green("all the op outputs are saved into directory 【{}】".format(output_path), 1)
-    restore_all_vars_persistable()
-
-ops = prog.current_block().ops
-vars = prog.current_block().vars
-
-pp_yellow(dot + dot + " checking op list")
-op_types = set()
-for op in ops:
-    op_types.add(op.type)
-pp_tab("op types : {}".format(op_types), 1)
-
-def check_mobile_results(args, fuse, mem_opt):
-    args = "{} {} {} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", "1" if quantification else "0", quantification_fold, args)
-    res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net {}\"".format(mobile_exec_root, args))
-    lines = res.split("\n")
-    # for line in lines:
-    #     print(line)
-    for line in lines:
-        if line.startswith("auto-test-debug"):
-            print(line)
-    pp_yellow(dot + dot + " checking paddle mobile results for {} -- {} ".format(green("【fusion】" if fuse else "【non fusion】"), green("【memory-optimization】" if mem_opt else "【non-memory-optimization】")))
-    mobile_var_cache = {}
-    for line in lines:
-        parts = line.split(" ")
-        if len(parts) < 2:
-            continue
-        if "auto-test" != parts[0]:
-            continue
-        if parts[1] == "load-time-cost":
-            pp_green("load time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "predict-time-cost":
-            pp_green("predict time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "preprocess-time-cost":
-            pp_green("preprocess time cost : {}".format(parts[2]), 1)
-        elif parts[1] == "var":
-            var_name = parts[2]
-            values = list(map(lambda x: float(x), parts[3:]))
-            mobile_var_cache[var_name] = values
-    error_index = None
-    error_values1 = None
-    error_values2 = None
-    checked_names = []
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    fetch_diff = 0.0
-    fetch_count = 0
-    for index in op_cache:
-        op_output_var_name, op = op_cache[index]
-        if not op_output_var_name in output_var_cache:
-            continue
-        if not op_output_var_name in mobile_var_cache:
-            continue
-        if op_output_var_name not in fetch_names:
-            continue
-        values1 = output_var_cache[op_output_var_name]
-        values2 = mobile_var_cache[op_output_var_name]
-        shape = get_var_shape(op_output_var_name) if check_shape else []
-        for i in range(len(values1)):
-            v1 = values1[i]
-            v2 = values2[len(shape) + i]
-            fetch_diff += abs(v1 - v2)
-            fetch_count += 1
-    if fetch_count != 0:
-        pp_yellow("output avg diff : {}".format(fetch_diff / fetch_count), 1)
-    for index in op_cache:
-        op_output_var_name, op = op_cache[index]
-        if mem_opt:
-            found_in_fetch = False
-            for fetch in fetches:
-                if op_output_var_name == fetch.name:
-                    found_in_fetch = True
-                    break
-            if not found_in_fetch:
-                continue
-        if not op_output_var_name in output_var_cache:
-            continue
-        if not op_output_var_name in mobile_var_cache:
-            continue
-        if op_output_var_name not in fetch_names:
-            continue
-        values1 = output_var_cache[op_output_var_name]
-        values2 = mobile_var_cache[op_output_var_name]
-        shape = get_var_shape(op_output_var_name) if check_shape else []
-        if len(values1) + len(shape) != len(values2):
-            error_index = index
-        for i in range(len(shape)):
-            v1 = shape[i]
-            v2 = values2[i]
-            if v1 != v2:
-                error_index = index
-                break
-        if error_index == None:
-            for i in range(len(values1)):
-                v1 = values1[i]
-                v2 = values2[len(shape) + i]
-                if abs(v1 - v2) > diff_threshold:
-                    error_index = index
-                    break
-        checked_names.append(op_output_var_name)
-        if error_index != None:
-            error_values1 = values1
-            error_values2 = values2
-            break
-    if error_index == None:
-        for name in fetch_names:
-            if name not in checked_names:
-                error_index = -1
-                break
-    if error_index == None:
-        pp_green("outputs are all correct", 1)
-    elif error_index == -1:
-        pp_red("outputs are missing")
-    else:
-        error_values1 = np.array(error_values1)
-        error_values2 = np.array(error_values2)
-        # pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
-        pp_red("outputs are incorrect", 1)
-        pp_red("fluid results are : ", 1)
-        pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
-        pp_yellow("paddle mobile results are : ", 1)
-        pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
-        if not fuse and not mem_opt:
-            pp_yellow("checking individual ops : ", 1)
-            error_index = None
-            error_values1 = None
-            error_values2 = None
-            checked_names = []
-            fetch_names = []
-            for fetch in fetches:
-                fetch_names.append(fetch.name)
-            for index in op_cache:
-                op_output_var_name, op = op_cache[index]
-                if mem_opt:
-                    found_in_fetch = False
-                    for fetch in fetches:
-                        if op_output_var_name == fetch.name:
-                            found_in_fetch = True
-                            break
-                    if not found_in_fetch:
-                        continue
-                if not op_output_var_name in output_var_cache:
-                    continue
-                if not op_output_var_name in mobile_var_cache:
-                    continue
-                if fuse or mem_opt:
-                    if op_output_var_name not in fetch_names:
-                        continue
-                values1 = output_var_cache[op_output_var_name]
-                values2 = mobile_var_cache[op_output_var_name]
-                shape = get_var_shape(op_output_var_name) if check_shape else []
-                if len(values1) + len(shape) != len(values2):
-                    error_index = index
-                for i in range(len(shape)):
-                    v1 = shape[i]
-                    v2 = values2[i]
-                    if v1 != v2:
-                        error_index = index
-                        break
-                if error_index == None:
-                    for i in range(len(values1)):
-                        v1 = values1[i]
-                        v2 = values2[len(shape) + i]
-                        if ((not math.isnan(v1)) and math.isnan(v2)) or abs(v1 - v2) > diff_threshold:
-                            error_index = index
-                            break
-                checked_names.append(op_output_var_name)
-                if error_index != None:
-                    error_values1 = values1
-                    error_values2 = values2
-                    break
-            if error_index == None:
-                for name in fetch_names:
-                    if name not in checked_names:
-                        error_index = -1
-                        break
-            if error_index == None:
-                pp_green("outputs are all correct", 1)
-            elif error_index == -1:
-                pp_red("outputs are missing")
-            else:
-                error_values1 = np.array(error_values1)
-                error_values2 = np.array(error_values2)
-                # pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
-                pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
-                    error_index,op_cache[error_index][1].type,op_output_var_name), 1)
-                pp_red("fluid results are : ", 1)
-                pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
-                pp_yellow("paddle mobile results are : ", 1)
-                pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
-    # print(output_var_cache)
-    # print(mobile_var_cache)
-
-def main():
-    # 加载kv
-    feed_kv = load_feed_kv()
-    if feed_kv == None:
-        feed_kv = gen_feed_kv()
-        save_feed_kv(feed_kv)
-        feed_kv = load_feed_kv()
-    # 预测
-    pp_yellow(dot + dot + " checking inference")
-    outputs = run_model(feed_kv=feed_kv)
-    pp_tab("fluid output : {}".format(outputs), 1)
-    # 重新保存模型
-    pp_yellow(dot + dot + " checking model correctness")
-    resave_model(feed_kv=feed_kv)
-    # 输出加密模型
-    encrypt_model()
-    # 输出所有中间结果
-    pp_yellow(dot + dot + " checking output result of every op")
-    save_all_op_output(feed_kv=feed_kv)
-    pp_yellow(dot + dot + " checking fetch info")
-    for fetch in fetches:
-        fetch_name = fetch.name
-        fetch_shape = get_var_shape(fetch_name)
-        pp_tab("fetch var name : {}; fetch var shape : {}".format(fetch_name, fetch_shape), 1)
-    # 输出所有op、var信息
-    info_file = open("info.txt", "w")
-    for i in range(len(ops)):
-        op = ops[i]
-        info_file.write("{}th op: type - {}\n".format(i, op.type))
-        info_file.write("inputs:\n")
-        for var_name in op.input_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-        info_file.write("outputs:\n")
-        for var_name in op.output_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-    info_file.close()
-    # 开始检查mobile的正确性
-    print("")
-    print("==================================================")
-    print("")
-    pp_yellow(dot + " start inspecting paddle mobile correctness & performance")
-    push(checked_model_path)
-    push(feed_path + "/" + last_feed_file_name, "input.txt")
-    push(mobile_src_root + "/build/release/{}/build/libpaddle-mobile.so".format(architecture))
-    push(mobile_src_root + "/build/release/{}/build/cl_kernel".format(architecture))
-    push(mobile_src_root + "/test/build/test-net")
-    last_feed_var_shape = get_feed_var_shape(last_feed_var_name)
-    args = str(len(last_feed_var_shape))
-    for dim in last_feed_var_shape:
-        args += " " + str(dim)
-    if is_lod:
-        args += " 1"
-        args += " " + str(len(last_feed_var_lod))
-        for dim in last_feed_var_lod:
-            args += " " + str(dim)
-    else:
-        args += " 0"
-    args += " " + str(len(output_var_cache))
-    args += " " + str(1 if is_sample_step else 0)
-    if is_sample_step:
-        args += " " + str(sample_step)
-    else:
-        args += " " + str(sample_num)
-    for var_name in output_var_cache.keys():
-        args += " " + var_name
-    args += " " + str(1 if check_shape else 0)
-    if not fast_check:
-        check_mobile_results(args, False, False)
-        check_mobile_results(args, False, True)
-    check_mobile_results(args, True, False)
-    check_mobile_results(args, True, True)
-
-if __name__ == "__main__":
-    main()
diff --git a/mobile/tools/python/fluidtools/run_multi_feed.py b/mobile/tools/python/fluidtools/run_multi_feed.py
deleted file mode 100644
index 6f706a2e22..0000000000
--- a/mobile/tools/python/fluidtools/run_multi_feed.py
+++ /dev/null
@@ -1,695 +0,0 @@
-# -*- coding: utf-8 -*
-import os
-import sys
-import math
-import subprocess
-import numpy as np
-import paddle.fluid as fluid
-
-model_path = "erciyuan"
-checked_model_path = "checked_model"
-feed_path = "feeds"
-output_path = "outputs"
-diff_threshold = 0.1
-is_lod = False
-mobile_model_path = ""
-fast_check = False
-is_sample_step = False
-sample_step = 1
-sample_num = 20
-need_encrypt = False
-checked_encrypt_model_path = "checked_encrypt_model"
-output_var_filter = []
-output_key_filter = {}
-check_shape = False
-quantification = False
-quantification_fold = 1000
-architecture = "arm-v7a"
-# architecture = "arm-v8a"
-correct_persistable = False
-
-np.set_printoptions(linewidth=150)
-
-mobile_exec_root = "/data/local/tmp/bin"
-mobile_src_root = os.path.abspath("../../../")
-if mobile_src_root.endswith("/"):
-    mobile_src_root = mobile_src_root[:-1]
-
-dot = "•"
-black = lambda x: "\033[30m" + str(x) + "\033[0m"
-red = lambda x: "\033[31m" + str(x) + "\033[0m"
-green = lambda x: "\033[32m" + str(x) + "\033[0m"
-yellow = lambda x: "\033[33m" + str(x) + "\033[0m"
-reset = lambda x: "\033[0m" + str(x)
-feed_names_ = []
-
-def pp_tab(x, level=0):
-    header = ""
-    for i in range(0, level):
-        header += "\t"
-    print(header + str(x))
-def pp_black(x, level=0):
-    pp_tab(black(x) + reset(""), level)
-def pp_red(x, level=0):
-    pp_tab(red(x) + reset(""), level)
-def pp_green(x, level=0):
-    pp_tab(green(x) + reset(""), level)
-def pp_yellow(x, level=0):
-    pp_tab(yellow(x) + reset(""), level)
-
-def sh(command):
-    pipe = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    return pipe.stdout.read().decode("utf-8")
-def push(src, dest=""):
-    sh("adb push {} {}".format(src, mobile_exec_root + "/" + dest))
-
-pp_yellow(dot + " start inspecting fluid model")
-
-exe = fluid.Executor(fluid.CPUPlace())
-exe.run(fluid.default_startup_program())
-
-# 加载模型
-def load_model(model_path):
-    prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model", params_filename="params")
-    global correct_persistable
-    if correct_persistable:
-        ops = prog.current_block().ops
-        vars = prog.current_block().vars
-        for op in ops:
-            for var_name in op.output_arg_names:
-                if var_name == "fetch":
-                    continue
-                var = vars[var_name]
-                if var.persistable:
-                    pp_red("has found non-persistable output var : {}".format(var_name))
-                    var.persistable = False
-    return (prog, feeds, fetches)
-
-prog, feeds, fetches = load_model(model_path)
-
-# 强制要求所有张量的形状,在model和params中一致,并重新保存模型
-def resave_model(feed_kv):
-    if len(mobile_model_path) > 0:
-        pp_green("has set mobile_model_path, stop checking model & params", 1)
-        sh("cp {}/* {}".format(mobile_model_path, checked_model_path))
-        return
-    ops = prog.current_block().ops
-    vars = prog.current_block().vars
-    # 强制所有var为可持久化
-    p_names = []
-    for name in vars:
-        name = str(name)
-        v = fluid.framework._get_var(name, prog)
-        if not v.persistable:
-            v.persistable = True
-            p_names.append(name)
-    outputs = run_model(feed_kv=feed_kv)
-    has_found_wrong_shape = False
-    # 修正每个var的形状
-    for name in vars:
-        name = str(name)
-        v = vars[name]
-        if v.persistable:
-            v1 = fluid.global_scope().find_var(name)
-            try:
-                t1 = v1.get_tensor()
-                shape = t1.shape()
-            except:
-                continue
-            if v.desc.shape() != shape:
-                has_found_wrong_shape = True
-            v.desc.set_shape(shape)
-    # 恢复var的可持久化属性
-    for name in p_names:
-        v = fluid.framework._get_var(name, prog)
-        v.persistable = False
-    if not quantification:
-        fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
-    if has_found_wrong_shape:
-        pp_red("has found wrong shape", 1)
-    else:
-        pp_green("has not found wrong shape", 1)
-    pp_green("new model is saved into directory 【{}】".format(checked_model_path), 1)
-
-# 分别加密model和params,加密key使用同一个
-def encrypt_model():
-    if not need_encrypt:
-        return
-    pp_yellow(dot + dot + " encrypting model")
-    if not os.path.exists(checked_encrypt_model_path):
-        os.mkdir(checked_encrypt_model_path)
-    res = sh("model-encrypt-tool/enc_key_gen -l 20 -c 232")
-    lines = res.split("\n")
-
-    for line in lines:
-        if line.startswith("key:"):
-            line = line.replace('key:','')
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/model -o "
-               "checked_model/model.ml".format(line))
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/params  -o checked_model/params.ml".format(line))
-            pp_green("model has been encrypted, key is : {}".format(line), 1)
-            sh("mv {} {}".format(checked_model_path + "/*.ml", checked_encrypt_model_path))
-            return
-    pp_red("model encrypt error", 1)
-
-# 生成feed的key-value对
-def gen_feed_kv():
-    feed_kv = {}
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        data = np.random.random(feed_shape).astype("float32")
-        feed_kv[feed_name] = data
-    return feed_kv
-
-# 保存feed的key-value对
-def save_feed_kv(feed_kv):
-    for feed_name in feed_kv:
-        feed_data = feed_kv[feed_name]
-        feed_list = feed_data.flatten().tolist()
-        if not os.path.exists(feed_path):
-            os.mkdir(feed_path)
-        file_name = feed_name.replace("/", "_")
-        out_file = open(feed_path + "/" + file_name, "w")
-        for feed_item in feed_list:
-            out_file.write("{}\n".format(feed_item))
-        out_file.close()
-
-last_feed_var_name = None
-last_feed_file_name = None
-last_feed_var_lod = None
-# 加载feed的key-value对
-def load_feed_kv():
-    if not os.path.exists(feed_path):
-        return None
-    global last_feed_var_name
-    global last_feed_file_name
-    global last_feed_var_lod
-    feed_kv = {}
-    pp_yellow(dot + dot + " checking feed info")
-    pp_green("feed data is saved into directory 【{}】".format(feed_path), 1)
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        pp_tab("feed var name : {}; feed var shape : {}".format(feed_name, feed_shape), 1)
-        file_name = feed_name.replace("/", "_")
-        last_feed_var_name = feed_name
-        last_feed_file_name = file_name
-        feed_file_path = feed_path + "/" + file_name
-        if not os.path.exists(feed_file_path):
-            return None
-        data = np.loadtxt(feed_file_path)
-        expected_len = 1
-        for dim in feed_shape:
-            expected_len *= dim
-        if len(np.atleast_1d(data)) != expected_len:
-            return None
-        data = data.reshape(feed_shape).astype("float32")
-        
-        if is_lod:
-            data_shape = [1]
-            for dim in feed_shape:
-                data_shape.append(dim)
-            data = data.reshape(data_shape).astype("float32")
-            tensor = fluid.LoDTensor()
-            seq_lens = [len(seq) for seq in data]
-            cur_len = 0
-            lod = [cur_len]
-            for l in seq_lens:
-                cur_len += l
-                lod.append(cur_len)
-            data = data.reshape(feed_shape)
-            tensor.set(data, fluid.CPUPlace())
-            tensor.set_lod([lod])
-            last_feed_var_lod = lod
-            feed_kv[feed_name] = tensor
-        else:
-            feed_kv[feed_name] = data
-    return feed_kv
-
-# 运行模型
-def run_model(feed_kv=None):
-    pp_yellow("run_model", 1)
-    if feed_kv is None:
-        feed_kv = gen_feed_kv()
-
-    feed_names_.clear()
-    for feed_name in feeds:
-        feed_names_.append(feed_name)
-        pp_green(feed_name, 1)
-
-
-    pp_green(feed_names_, 1)
-
-    outputs = exe.run(prog, feed=feed_kv, fetch_list=fetches, return_numpy=False)
-    results = []
-    for output in outputs:
-        results.append(np.array(output))
-    return results
-
-# 获取变量形状
-def get_var_shape(var_name):
-    vars = prog.current_block().vars
-    shape = vars[var_name].desc.shape()
-    for i in range(len(shape)):
-        dim = shape[i]
-        if dim == -1:
-            shape[i] = 1
-    return shape
-
-# 获取输入变量形状
-def get_feed_var_shape(var_name):
-    # 如果想写死输入形状,放开以下语句
-    # return [1, 3, 224, 224]
-    return get_var_shape(var_name)
-
-persistable_cache = []
-# 所有var,全部变成持久化
-def force_all_vars_to_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if not persistable:
-            persistable_cache.append(var_name)
-            v.persistable = True
-
-# 恢复持久化属性
-def restore_all_vars_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if var_name in persistable_cache:
-            v.persistable = False
-    persistable_cache = []
-
-# 获取var的数据
-def get_var_data(var_name, feed_kv=None):
-    output = np.array(fluid.global_scope().var(var_name).get_tensor())
-    return output
-
-output_var_cache = {}
-def tensor_sample(tensor):
-    if is_sample_step:
-        step = sample_step
-    else:
-        step = math.floor(len(tensor) / sample_num)
-    step = max(step, 1)
-    step = int(step)
-    sample = []
-    for i in range(0, len(tensor), step):
-        sample.append(tensor[i])
-    return sample
-
-op_cache = {}
-# 获取每层输出的数据
-def save_all_op_output(feed_kv=None):
-    force_all_vars_to_persistable()
-    outputs = run_model(feed_kv=feed_kv)
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    ops = prog.current_block().ops
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    feed_names = feeds
-    if len(output_var_filter) > 0:
-        for fetch_name in fetch_names:
-            output_var_filter.append(fetch_name)
-    for i in range(len(ops)):
-        op = ops[i]
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in ["Y", "Out", "Output"]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            for name in op.output_arg_names:
-                var_name = name
-                if "tmp" in name:
-                    break
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    for i in range(len(ops)):
-        op = ops[i]
-        if op.type not in output_key_filter:
-            continue
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in output_key_filter[op.type]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            continue
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    pp_green("all the op outputs are saved into directory 【{}】".format(output_path), 1)
-    restore_all_vars_persistable()
-
-ops = prog.current_block().ops
-vars = prog.current_block().vars
-
-pp_yellow(dot + dot + " checking op list")
-op_types = set()
-for op in ops:
-    op_types.add(op.type)
-pp_tab("op types : {}".format(op_types), 1)
-
-def check_mobile_results(args, fuse, mem_opt):
-    args = "{} {} {} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", "1" if quantification else "0", quantification_fold, args)
-    pp_green(args, 1)
-    res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net-feeds {}\"".format(mobile_exec_root, args))
-    lines = res.split("\n")
-    for line in lines:
-        print(line)
-    # for line in lines:
-    #     if line.startswith("auto-test-debug"):
-    #         print(line)
-    pp_yellow(dot + dot + " checking paddle mobile results for {} -- {} ".format(green("【fusion】" if fuse else "【non fusion】"), green("【memory-optimization】" if mem_opt else "【non-memory-optimization】")))
-    mobile_var_cache = {}
-    for line in lines:
-        parts = line.split(" ")
-        if len(parts) < 2:
-            continue
-        if "auto-test" != parts[0]:
-            continue
-        if parts[1] == "load-time-cost":
-            pp_green("load time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "predict-time-cost":
-            pp_green("predict time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "preprocess-time-cost":
-            pp_green("preprocess time cost : {}".format(parts[2]), 1)
-        elif parts[1] == "var":
-            var_name = parts[2]
-            values = list(map(lambda x: float(x), parts[3:]))
-            mobile_var_cache[var_name] = values
-    error_index = None
-    error_values1 = None
-    error_values2 = None
-    checked_names = []
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    fetch_diff = 0.0
-    fetch_count = 0
-    for index in op_cache:
-        op_output_var_name, op = op_cache[index]
-        if not op_output_var_name in output_var_cache:
-            continue
-        if not op_output_var_name in mobile_var_cache:
-            continue
-        if op_output_var_name not in fetch_names:
-            continue
-        values1 = output_var_cache[op_output_var_name]
-        values2 = mobile_var_cache[op_output_var_name]
-        shape = get_var_shape(op_output_var_name) if check_shape else []
-        for i in range(len(values1)):
-            v1 = values1[i]
-            v2 = values2[len(shape) + i]
-            fetch_diff += abs(v1 - v2)
-            fetch_count += 1
-    if fetch_count != 0:
-        pp_yellow("output avg diff : {}".format(fetch_diff / fetch_count), 1)
-    for index in op_cache:
-        op_output_var_name, op = op_cache[index]
-        if mem_opt:
-            found_in_fetch = False
-            for fetch in fetches:
-                if op_output_var_name == fetch.name:
-                    found_in_fetch = True
-                    break
-            if not found_in_fetch:
-                continue
-        if not op_output_var_name in output_var_cache:
-            continue
-        if not op_output_var_name in mobile_var_cache:
-            continue
-        if op_output_var_name not in fetch_names:
-            continue
-        values1 = output_var_cache[op_output_var_name]
-        values2 = mobile_var_cache[op_output_var_name]
-        shape = get_var_shape(op_output_var_name) if check_shape else []
-        if len(values1) + len(shape) != len(values2):
-            error_index = index
-        for i in range(len(shape)):
-            v1 = shape[i]
-            v2 = values2[i]
-            if v1 != v2:
-                error_index = index
-                break
-        if error_index == None:
-            for i in range(len(values1)):
-                v1 = values1[i]
-                v2 = values2[len(shape) + i]
-                if abs(v1 - v2) > diff_threshold:
-                    error_index = index
-                    break
-        checked_names.append(op_output_var_name)
-        if error_index != None:
-            error_values1 = values1
-            error_values2 = values2
-            break
-    if error_index == None:
-        for name in fetch_names:
-            if name not in checked_names:
-                error_index = -1
-                break
-    if error_index == None:
-        pp_green("outputs are all correct", 1)
-    elif error_index == -1:
-        pp_red("outputs are missing")
-    else:
-        error_values1 = np.array(error_values1)
-        error_values2 = np.array(error_values2)
-        # pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
-        pp_red("outputs are incorrect", 1)
-        pp_red("fluid results are : ", 1)
-        pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
-        pp_yellow("paddle mobile results are : ", 1)
-        pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
-        if not fuse and not mem_opt:
-            pp_yellow("checking individual ops : ", 1)
-            error_index = None
-            error_values1 = None
-            error_values2 = None
-            checked_names = []
-            fetch_names = []
-            for fetch in fetches:
-                fetch_names.append(fetch.name)
-            for index in op_cache:
-                op_output_var_name, op = op_cache[index]
-                if mem_opt:
-                    found_in_fetch = False
-                    for fetch in fetches:
-                        if op_output_var_name == fetch.name:
-                            found_in_fetch = True
-                            break
-                    if not found_in_fetch:
-                        continue
-                if not op_output_var_name in output_var_cache:
-                    continue
-                if not op_output_var_name in mobile_var_cache:
-                    continue
-                if fuse or mem_opt:
-                    if op_output_var_name not in fetch_names:
-                        continue
-                values1 = output_var_cache[op_output_var_name]
-                values2 = mobile_var_cache[op_output_var_name]
-                shape = get_var_shape(op_output_var_name) if check_shape else []
-                if len(values1) + len(shape) != len(values2):
-                    error_index = index
-                for i in range(len(shape)):
-                    v1 = shape[i]
-                    v2 = values2[i]
-                    if v1 != v2:
-                        error_index = index
-                        break
-                if error_index == None:
-                    for i in range(len(values1)):
-                        v1 = values1[i]
-                        v2 = values2[len(shape) + i]
-                        if ((not math.isnan(v1)) and math.isnan(v2)) or abs(v1 - v2) > diff_threshold:
-                            error_index = index
-                            break
-                checked_names.append(op_output_var_name)
-                if error_index != None:
-                    error_values1 = values1
-                    error_values2 = values2
-                    break
-            if error_index == None:
-                for name in fetch_names:
-                    if name not in checked_names:
-                        error_index = -1
-                        break
-            if error_index == None:
-                pp_green("outputs are all correct", 1)
-            elif error_index == -1:
-                pp_red("outputs are missing")
-            else:
-                error_values1 = np.array(error_values1)
-                error_values2 = np.array(error_values2)
-                # pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
-                pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
-                    error_index,op_cache[error_index][1].type,op_output_var_name), 1)
-                pp_red("fluid results are : ", 1)
-                pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
-                pp_yellow("paddle mobile results are : ", 1)
-                pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
-    # print(output_var_cache)
-    # print(mobile_var_cache)
-
-def main():
-    # 加载kv
-    feed_kv = load_feed_kv()
-    if feed_kv == None:
-        feed_kv = gen_feed_kv()
-        save_feed_kv(feed_kv)
-        feed_kv = load_feed_kv()
-    # 预测
-    pp_yellow(dot + dot + " checking inference")
-    outputs = run_model(feed_kv=feed_kv)
-    pp_tab("fluid output : {}".format(outputs), 1)
-    # 重新保存模型
-    pp_yellow(dot + dot + " checking model correctness")
-    resave_model(feed_kv=feed_kv)
-    # 输出加密模型
-    encrypt_model()
-    # 输出所有中间结果
-    pp_yellow(dot + dot + " checking output result of every op")
-    save_all_op_output(feed_kv=feed_kv)
-    pp_yellow(dot + dot + " checking fetch info")
-    for fetch in fetches:
-        fetch_name = fetch.name
-        fetch_shape = get_var_shape(fetch_name)
-        pp_tab("fetch var name : {}; fetch var shape : {}".format(fetch_name, fetch_shape), 1)
-    # 输出所有op、var信息
-    info_file = open("info.txt", "w")
-    for i in range(len(ops)):
-        op = ops[i]
-        info_file.write("{}th op: type - {}\n".format(i, op.type))
-        info_file.write("inputs:\n")
-        for var_name in op.input_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-        info_file.write("outputs:\n")
-        for var_name in op.output_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-    info_file.close()
-    # 开始检查mobile的正确性
-    print("")
-    print("==================================================")
-    print("")
-    pp_yellow(dot + " start inspecting paddle mobile correctness & performance")
-    push(checked_model_path)
-
-    pp_green(feed_names_, 1)
-    feed_names_argu = ""
-    for n in feed_names_:
-        feed_names_argu += "{}\n".format(n)
-        pp_green("feed name - {} ".format(str(n)), 1)
-        push(feed_path + "/" + str(n), "{}".format(str(n)))
-
-    push(feed_path + "/" + last_feed_file_name, "input.txt")
-    push(mobile_src_root + "/build/release/{}/build/libpaddle-mobile.so".format(architecture))
-    push(mobile_src_root + "/build/release/{}/build/cl_kernel".format(architecture))
-    push(mobile_src_root + "/test/build/test-net")
-    last_feed_var_shape = get_feed_var_shape(last_feed_var_name)
-    args = str(len(last_feed_var_shape))
-    for dim in last_feed_var_shape:
-        args += " " + str(dim)
-    if is_lod:
-        args += " 1"
-        args += " " + str(len(last_feed_var_lod))
-        for dim in last_feed_var_lod:
-            args += " " + str(dim)
-    else:
-        args += " 0"
-    args += " " + str(len(output_var_cache))
-    args += " " + str(1 if is_sample_step else 0)
-    if is_sample_step:
-        args += " " + str(sample_step)
-    else:
-        args += " " + str(sample_num)
-    for var_name in output_var_cache.keys():
-        args += " " + var_name
-    args += " " + str(1 if check_shape else 0)
-    if not fast_check:
-        check_mobile_results(args, False, False)
-        check_mobile_results(args, False, True)
-    check_mobile_results(args, True, False)
-    check_mobile_results(args, True, True)
-
-if __name__ == "__main__":
-    main()
diff --git a/mobile/tools/python/fluidtools/test_wrap.py b/mobile/tools/python/fluidtools/test_wrap.py
deleted file mode 100644
index 527a5a6584..0000000000
--- a/mobile/tools/python/fluidtools/test_wrap.py
+++ /dev/null
@@ -1,546 +0,0 @@
-# -*- coding: utf-8 -*
-import os
-import sys
-import math
-import subprocess
-import numpy as np
-import paddle.fluid as fluid
-
-model_path = "yolov2"
-checked_model_path = "checked_model"
-feed_path = "feeds"
-output_path = "outputs"
-diff_threshold = 0.05
-is_lod = False
-mobile_model_path = ""
-fast_check = False
-is_sample_step = False
-sample_step = 1
-sample_num = 20
-need_encrypt = False
-checked_encrypt_model_path = "checked_encrypt_model"
-output_var_filter = []
-output_key_filter = {}
-check_shape = False
-
-np.set_printoptions(linewidth=150)
-
-mobile_exec_root = "/data/local/tmp/bin"
-mobile_src_root = os.path.abspath("../../../")
-if mobile_src_root.endswith("/"):
-    mobile_src_root = mobile_src_root[:-1]
-
-dot = "•"
-black = lambda x: "\033[30m" + str(x) + "\033[0m"
-red = lambda x: "\033[31m" + str(x) + "\033[0m"
-green = lambda x: "\033[32m" + str(x) + "\033[0m"
-yellow = lambda x: "\033[33m" + str(x) + "\033[0m"
-reset = lambda x: "\033[0m" + str(x)
-
-def pp_tab(x, level=0):
-    header = ""
-    for i in range(0, level):
-        header += "\t"
-    print(header + str(x))
-def pp_black(x, level=0):
-    pp_tab(black(x) + reset(""), level)
-def pp_red(x, level=0):
-    pp_tab(red(x) + reset(""), level)
-def pp_green(x, level=0):
-    pp_tab(green(x) + reset(""), level)
-def pp_yellow(x, level=0):
-    pp_tab(yellow(x) + reset(""), level)
-
-def sh(command):
-    pipe = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    return pipe.stdout.read().decode("utf-8")
-def push(src, dest=""):
-    sh("adb push {} {}".format(src, mobile_exec_root + "/" + dest))
-
-pp_yellow(dot + " start inspecting fluid model")
-
-exe = fluid.Executor(fluid.CPUPlace())
-exe.run(fluid.default_startup_program())
-
-# 加载模型
-def load_model(model_path):
-    prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model", params_filename="params")
-    return (prog, feeds, fetches)
-
-prog, feeds, fetches = load_model(model_path)
-
-# 强制要求所有张量的形状,在model和params中一致,并重新保存模型
-def resave_model(feed_kv):
-    if len(mobile_model_path) > 0:
-        pp_green("has set mobile_model_path, stop checking model & params", 1)
-        sh("cp {}/* {}".format(mobile_model_path, checked_model_path))
-        return
-    ops = prog.current_block().ops
-    vars = prog.current_block().vars
-    # 强制所有var为可持久化
-    p_names = []
-    for name in vars:
-        name = str(name)
-        v = fluid.framework._get_var(name, prog)
-        if not v.persistable:
-            v.persistable = True
-            p_names.append(name)
-    outputs = run_model(feed_kv=feed_kv)
-    has_found_wrong_shape = False
-    # 修正每个var的形状
-    for name in vars:
-        name = str(name)
-        v = vars[name]
-        if v.persistable:
-            v1 = fluid.global_scope().find_var(name)
-            try:
-                t1 = v1.get_tensor()
-                shape = t1.shape()
-            except:
-                continue
-            if v.desc.shape() != shape:
-                has_found_wrong_shape = True
-            v.desc.set_shape(shape)
-    # 恢复var的可持久化属性
-    for name in p_names:
-        v = fluid.framework._get_var(name, prog)
-        v.persistable = False
-    fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
-    if has_found_wrong_shape:
-        pp_red("has found wrong shape", 1)
-    else:
-        pp_green("has not found wrong shape", 1)
-    pp_green("new model is saved into directory 【{}】".format(checked_model_path), 1)
-
-# 分别加密model和params,加密key使用同一个
-def encrypt_model():
-    if not need_encrypt:
-        return
-    pp_yellow(dot + dot + " encrypting model")
-    if not os.path.exists(checked_encrypt_model_path):
-        os.mkdir(checked_encrypt_model_path)
-    res = sh("model-encrypt-tool/enc_key_gen -l 20 -c 232")
-    lines = res.split("\n")
-
-    for line in lines:
-        if line.startswith("key:"):
-            line = line.replace('key:','')
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/model -o "
-               "checked_model/model.ml".format(line))
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/params  -o checked_model/params.ml".format(line))
-            pp_green("model has been encrypted, key is : {}".format(line), 1)
-            sh("mv {} {}".format(checked_model_path + "/*.ml", checked_encrypt_model_path))
-            return
-    pp_red("model encrypt error", 1)
-
-# 生成feed的key-value对
-def gen_feed_kv():
-    feed_kv = {}
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        data = np.random.random(feed_shape).astype("float32")
-        feed_kv[feed_name] = data
-    return feed_kv
-
-# 保存feed的key-value对
-def save_feed_kv(feed_kv):
-    for feed_name in feed_kv:
-        feed_data = feed_kv[feed_name]
-        feed_list = feed_data.flatten().tolist()
-        if not os.path.exists(feed_path):
-            os.mkdir(feed_path)
-        file_name = feed_name.replace("/", "_")
-        out_file = open(feed_path + "/" + file_name, "w")
-        for feed_item in feed_list:
-            out_file.write("{}\n".format(feed_item))
-        out_file.close()
-
-last_feed_var_name = None
-last_feed_file_name = None
-last_feed_var_lod = None
-# 加载feed的key-value对
-def load_feed_kv():
-    if not os.path.exists(feed_path):
-        return None
-    global last_feed_var_name
-    global last_feed_file_name
-    global last_feed_var_lod
-    feed_kv = {}
-    pp_yellow(dot + dot + " checking feed info")
-    pp_green("feed data is saved into directory 【{}】".format(feed_path), 1)
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        pp_tab("feed var name : {}; feed var shape : {}".format(feed_name, feed_shape), 1)
-        file_name = feed_name.replace("/", "_")
-        last_feed_var_name = feed_name
-        last_feed_file_name = file_name
-        feed_file_path = feed_path + "/" + file_name
-        if not os.path.exists(feed_file_path):
-            return None
-        data = np.loadtxt(feed_file_path)
-        expected_len = 1
-        for dim in feed_shape:
-            expected_len *= dim
-        if len(np.atleast_1d(data)) != expected_len:
-            return None
-        data = data.reshape(feed_shape).astype("float32")
-        
-        if is_lod:
-            data_shape = [1]
-            for dim in feed_shape:
-                data_shape.append(dim)
-            data = data.reshape(data_shape).astype("float32")
-            tensor = fluid.LoDTensor()
-            seq_lens = [len(seq) for seq in data]
-            cur_len = 0
-            lod = [cur_len]
-            for l in seq_lens:
-                cur_len += l
-                lod.append(cur_len)
-            data = data.reshape(feed_shape)
-            tensor.set(data, fluid.CPUPlace())
-            tensor.set_lod([lod])
-            last_feed_var_lod = lod
-            feed_kv[feed_name] = tensor
-        else:
-            feed_kv[feed_name] = data
-    return feed_kv
-
-# 运行模型
-def run_model(feed_kv=None):
-    if feed_kv is None:
-        feed_kv = gen_feed_kv()
-    outputs = exe.run(prog, feed=feed_kv, fetch_list=fetches, return_numpy=False)
-    results = []
-    for output in outputs:
-        results.append(np.array(output))
-    return results
-
-# 获取变量形状
-def get_var_shape(var_name):
-    vars = prog.current_block().vars
-    shape = vars[var_name].desc.shape()
-    for i in range(len(shape)):
-        dim = shape[i]
-        if dim == -1:
-            shape[i] = 1
-    return shape
-
-# 获取输入变量形状
-def get_feed_var_shape(var_name):
-    # 如果想写死输入形状,放开以下语句
-    # return [1, 3, 224, 224]
-    return get_var_shape(var_name)
-
-persistable_cache = []
-# 所有var,全部变成持久化
-def force_all_vars_to_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if not persistable:
-            persistable_cache.append(var_name)
-            v.persistable = True
-
-# 恢复持久化属性
-def restore_all_vars_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if var_name in persistable_cache:
-            v.persistable = False
-    persistable_cache = []
-
-# 获取var的数据
-def get_var_data(var_name, feed_kv=None):
-    output = np.array(fluid.global_scope().var(var_name).get_tensor())
-    return output
-
-output_var_cache = {}
-def tensor_sample(tensor):
-    if is_sample_step:
-        step = sample_step
-    else:
-        step = math.floor(len(tensor) / sample_num)
-    step = max(step, 1)
-    step = int(step)
-    sample = []
-    for i in range(0, len(tensor), step):
-        sample.append(tensor[i])
-    return sample
-
-op_cache = {}
-# 获取每层输出的数据
-def save_all_op_output(feed_kv=None):
-    force_all_vars_to_persistable()
-    outputs = run_model(feed_kv=feed_kv)
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    ops = prog.current_block().ops
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    feed_names = feeds
-    for fetch_name in fetch_names:
-        output_var_filter.append(fetch_name)
-    for i in range(len(ops)):
-        op = ops[i]
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in ["Y", "Out", "Output"]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            for name in op.output_arg_names:
-                var_name = name
-                if "tmp" in name:
-                    break
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    for i in range(len(ops)):
-        op = ops[i]
-        if op.type not in output_key_filter:
-            continue
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in output_key_filter[op.type]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            continue
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    pp_green("all the op outputs are saved into directory 【{}】".format(output_path), 1)
-    restore_all_vars_persistable()
-
-ops = prog.current_block().ops
-vars = prog.current_block().vars
-
-pp_yellow(dot + dot + " checking op list")
-op_types = set()
-for op in ops:
-    op_types.add(op.type)
-pp_tab("op types : {}".format(op_types), 1)
-
-def check_mobile_results(args, fuse, mem_opt):
-    args = "{} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", args)
-    res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net {}\"".format(mobile_exec_root, args))
-    lines = res.split("\n")
-    for line in lines:
-        print(line)
-    for line in lines:
-        if line.startswith("auto-test-debug"):
-            print(line)
-    pp_yellow(dot + dot + " checking paddle mobile results for {} -- {} ".format(green("【fusion】" if fuse else "【non fusion】"), green("【memory-optimization】" if mem_opt else "【non-memory-optimization】")))
-    mobile_var_cache = {}
-    for line in lines:
-        parts = line.split(" ")
-        if len(parts) < 2:
-            continue
-        if "auto-test" != parts[0]:
-            continue
-        if parts[1] == "load-time-cost":
-            pp_green("load time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "predict-time-cost":
-            pp_green("predict time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "preprocess-time-cost":
-            pp_green("preprocess time cost : {}".format(parts[2]), 1)
-        elif parts[1] == "var":
-            var_name = parts[2]
-            values = list(map(lambda x: float(x), parts[3:]))
-            mobile_var_cache[var_name] = values
-    error_index = None
-    error_values1 = None
-    error_values2 = None
-    checked_names = []
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    for index in op_cache:
-        op_output_var_name, op = op_cache[index]
-        if mem_opt:
-            found_in_fetch = False
-            for fetch in fetches:
-                if op_output_var_name == fetch.name:
-                    found_in_fetch = True
-                    break
-            if not found_in_fetch:
-                continue
-        if not op_output_var_name in output_var_cache:
-            continue
-        if not op_output_var_name in mobile_var_cache:
-            continue
-        values1 = output_var_cache[op_output_var_name]
-        values2 = mobile_var_cache[op_output_var_name]
-        shape = get_var_shape(op_output_var_name) if check_shape else []
-        if len(values1) + len(shape) != len(values2):
-            error_index = index
-        for i in range(len(shape)):
-            v1 = shape[i]
-            v2 = values2[i]
-            if v1 != v2:
-                error_index = index
-                break
-        if error_index == None:
-            for i in range(len(values1)):
-                v1 = values1[i]
-                v2 = values2[len(shape) + i]
-                if abs(v1 - v2) > diff_threshold:
-                    error_index = index
-                    break
-        checked_names.append(op_output_var_name)
-        if error_index != None:
-            error_values1 = values1
-            error_values2 = values2
-            break
-    if error_index == None:
-        for name in fetch_names:
-            if name not in checked_names:
-                error_index = -1
-                break
-    if error_index == None:
-        pp_green("outputs are all correct", 1)
-    elif error_index == -1:
-        pp_red("outputs are missing")
-    else:
-        error_values1 = np.array(error_values1)
-        error_values2 = np.array(error_values2)
-        # pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
-        pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
-            error_index,op_cache[error_index][1].type,op_output_var_name), 1)
-        pp_red("fluid results are : ", 1)
-        pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
-        pp_yellow("paddle mobile results are : ", 1)
-        pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
-    # print(output_var_cache)
-    # print(mobile_var_cache)
-
-def main():
-    # 加载kv
-    feed_kv = load_feed_kv()
-    if feed_kv == None:
-        feed_kv = gen_feed_kv()
-        save_feed_kv(feed_kv)
-        feed_kv = load_feed_kv()
-    # 预测
-    pp_yellow(dot + dot + " checking inference")
-    outputs = run_model(feed_kv=feed_kv)
-    pp_tab("fluid output : {}".format(outputs), 1)
-    # 重新保存模型
-    pp_yellow(dot + dot + " checking model correctness")
-    resave_model(feed_kv=feed_kv)
-    # 输出加密模型
-    encrypt_model()
-    # 输出所有中间结果
-    pp_yellow(dot + dot + " checking output result of every op")
-    save_all_op_output(feed_kv=feed_kv)
-    pp_yellow(dot + dot + " checking fetch info")
-    for fetch in fetches:
-        fetch_name = fetch.name
-        fetch_shape = get_var_shape(fetch_name)
-        pp_tab("fetch var name : {}; fetch var shape : {}".format(fetch_name, fetch_shape), 1)
-    # 输出所有op、var信息
-    info_file = open("info.txt", "w")
-    for i in range(len(ops)):
-        op = ops[i]
-        info_file.write("{}th op: type - {}\n".format(i, op.type))
-        info_file.write("inputs:\n")
-        for var_name in op.input_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-        info_file.write("outputs:\n")
-        for var_name in op.output_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-    info_file.close()
-    # 开始检查mobile的正确性
-    print("")
-    print("==================================================")
-    print("")
-    pp_yellow(dot + " start inspecting paddle mobile correctness & performance")
-    push(checked_model_path)
-    push(feed_path + "/" + last_feed_file_name, "input.txt")
-    push(mobile_src_root + "/build/release/arm-v7a/build/libpaddle-mobile.so")
-    push(mobile_src_root + "/build/release/arm-v7a/build/cl_kernel")
-    push(mobile_src_root + "/test/build/test-wrap")
-    res = sh("adb shell 'cd {} && export LD_LIBRARY_PATH=. && ./test-wrap'".format(mobile_exec_root))
-    lines = res.split("\n")
-    for line in lines:
-        print(line)
-
-if __name__ == "__main__":
-    main()
diff --git a/mobile/tools/python/imagetools/README.md b/mobile/tools/python/imagetools/README.md
deleted file mode 100644
index 91106c8008..0000000000
--- a/mobile/tools/python/imagetools/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# imagetools
-
-This directory contains scripts generating input data file for paddle-mobile. The image data `g_test_image_1x3x224x224_banana` (used by `test/net/test_mobilenet.cpp`) of [http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) is generated by this script.
-
-## Generate Input
-
-Edit script `img2nchw.py` as below according to your need:
-
-```python
-if __name__ == "__main__":
-    # set paras
-    input_image_path = 'banana.jpg'
-    reshape_dict = {"n":1, "c":3, "h":48, "w":512}
-    output_path = input_image_path.replace(input_image_path[-4:],
-                                           "_" + "_".join([str(reshape_dict['n']),
-                                                           str(reshape_dict['c']),
-                                                           str(reshape_dict['h']),
-                                                           str(reshape_dict['w']),
-                                                           "nchw",
-                                                           "float"],))
-    channel_type = ChannelType.BGR
-    mean_bgr = (103.94, 116.78, 123.68) # (0, 0, 0)
-    pixel_scale = 0.017
-```
diff --git a/mobile/tools/python/imagetools/imagetools.py b/mobile/tools/python/imagetools/imagetools.py
deleted file mode 100644
index 2d0864d729..0000000000
--- a/mobile/tools/python/imagetools/imagetools.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# coding=utf-8
-import cv2
-from array import array
-
-
-def resize_take_rgbs(path, shape_h_w, SHOW_IMG=False):
-    print("[INFO] ---- resize_take_rgbs ---- start")
-
-    image = cv2.imread(path)
-    print("[INFO] image.shape:{}".format(image.shape))
-    print("[INFO] shape_h_w:{}".format(shape_h_w))
-
-    if SHOW_IMG:
-        cv2.imshow("before", image)
-
-    print_rgb(image[0, 0])
-    # image len may be for .just check it
-    # image.resize(shape_h_w)
-
-    image = cv2.resize(image, (shape_h_w[0], shape_h_w[1]))
-
-    if SHOW_IMG:
-        cv2.imshow("after", image)
-
-    print("[INFO] resized image.shape:{}".format(image.shape))
-    height = shape_h_w[0]
-    width = shape_h_w[1]
-
-    rs_ = []
-    gs_ = []
-    bs_ = []
-    for h in range(0, height):
-        for w in range(0, width):
-            '''
-            bs_.append(image[h, w, 0])
-            gs_.append(image[h, w, 1])
-            rs_.append(image[h, w, 2])
-            '''
-            bs_.append(image[w, h, 0])
-            gs_.append(image[w, h, 1])
-            rs_.append(image[w, h, 2])
-
-    # print image[2, 2, 0]/255.
-    print len(bs_)
-    print len(gs_)
-    print len(rs_)
-    print("[INFO] ---- resize_take_rgbs ---- end")
-    return bs_, gs_, rs_
-
-
-def print_rgb((b, g, r)):
-    print "像素 - R:%d,G:%d,B:%d" % (r, g, b)  # 显示像素值
-    #
-    # image[0, 0] = (100, 150, 200)  # 更改位置(0,0)处的像素
-    #
-    # (b, g, r) = image[0, 0]  # 再次读取(0,0)像素
-    # print "位置(0,0)处的像素 - 红:%d,绿:%d,蓝:%d" % (r, g, b)  # 显示更改后的像素值
-    #
-    # corner = image[0:100, 0:100]  # 读取像素块
-    # cv2.imshow("Corner", corner)  # 显示读取的像素块
-    #
-    # image[0:100, 0:100] = (0, 255, 0);  # 更改读取的像素块
-    #
-    # cv2.imshow("Updated", image)  # 显示图像
-    #
-    # cv2.waitKey(0)  # 程序暂停
-
-
-def save_to_file(to_file_name, array):
-    with open(to_file_name, "wb") as file_handle:
-        array.tofile(file_handle)
diff --git a/mobile/tools/python/imagetools/img2nchw.py b/mobile/tools/python/imagetools/img2nchw.py
deleted file mode 100644
index f8e7c74a9d..0000000000
--- a/mobile/tools/python/imagetools/img2nchw.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# coding=utf-8
-import cv2
-from array import array
-import imagetools as tools
-from enum import Enum
-
-
-class ChannelType(Enum):
-    RGB = 0,
-    BGR = 1
-
-def combine_bgrs_nchw(bgrs, means_b_g_r=(103.94, 116.78, 123.68), scale=0.017, channel_type=ChannelType.BGR):
-    print("[INFO] ---- combine_bgrs_nchw ---- start")
-    print("[INFO] scale:{}".format(scale))
-    print("[INFO] mean_b_g_r:{}".format(means_b_g_r))
-    #print("[INFO] bgrs:{}".format(bgrs))
-
-    bs = bgrs[0]
-    gs = bgrs[1]
-    rs = bgrs[2]
-    assert len(bs) == len(gs) == len(rs)
-    print("[INFO] element size of blue channel = len(bs) = {}".format(len(bs)))
-
-    bgrs_float_array = array('f')
-    if channel_type == ChannelType.BGR:
-        print('[INFO] bgr format')
-        for i in range(0, len(bs)):
-            bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale)  # b
-        for i in range(0, len(gs)):
-            bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale)  # g
-        for i in range(0, len(rs)):
-            bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale)  # r
-    elif channel_type == ChannelType.RGB:
-        print('[INFO] rgb format')
-        for i in range(0, len(rs)):
-            bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale)  # r
-        for i in range(0, len(gs)):
-            bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale)  # g
-        for i in range(0, len(bs)):
-            bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale)  # b
-
-    '''
-    print("lenI(bgrs_float_array)={}".format(len(bgrs_float_array)))
-    print '------------------'
-    print bgrs_float_array[0]
-    print bgrs_float_array[224 * 224 * 2 + 224 * 2 + 2]
-    # for i in range(0, 9):
-    #     print'bs %d' % i
-    #     print bs[i] / 255.
-    print bs[224 * 2 + 2] / 255.
-    '''
-    print("[INFO] ---- combine_bgrs_nchw ---- end")
-    return bgrs_float_array
-
-
-if __name__ == "__main__":
-    # set paras
-    #input_image_path = 'banana.jpg'
-    #input_image_path = "ocr_detect_512x512.png"
-    input_image_path = "ocr_recog_48x512.png"
-
-    reshape_dict = {"n":1, "c":3, "h":48, "w":512}
-    output_path = input_image_path.replace(input_image_path[-4:],
-                                           "_" + "_".join([str(reshape_dict['n']),
-                                                           str(reshape_dict['c']),
-                                                           str(reshape_dict['h']),
-                                                           str(reshape_dict['w']),
-                                                           "nchw",
-                                                           "float"],))
-    channel_type = ChannelType.BGR
-    mean_bgr = (103.94, 116.78, 123.68)
-    pixel_scale = 0.017
-    #mean_bgr = (0, 0, 0)
-    #pixel_scale = 1. / 255
-
-    print("[INFO] input_image_path:{}".format(input_image_path))
-    print("[INFO] reshape_dict:{}".format(reshape_dict))
-    print("[INFO] output_path:{}".format(output_path))
-    print("[INFO] mean_bgr:{}".format(mean_bgr))
-    print("[INFO] pixel_scale:{}".format(pixel_scale))
-
-    bgrs = tools.resize_take_rgbs(input_image_path, (reshape_dict['h'],
-                                                     reshape_dict['w'],
-                                                     reshape_dict['c']))
-    array = combine_bgrs_nchw(bgrs, mean_bgr, pixel_scale, channel_type)
-    tools.save_to_file(output_path, array)
-    print("[INFO] save {} successfully".format(output_path))
-    #cv2.waitKey(0)
diff --git a/mobile/tools/python/imagetools/img2nhwc.py b/mobile/tools/python/imagetools/img2nhwc.py
deleted file mode 100644
index c982fe303e..0000000000
--- a/mobile/tools/python/imagetools/img2nhwc.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# coding=utf-8
-import cv2
-from array import array
-import imagetools as tools
-
-
-def combine_bgrs_nhwc(bgrs, means_b_g_r, scale):
-    print "scale: %f" % scale
-    print means_b_g_r
-    # print len(bgrs)
-    bs = bgrs[0]
-    gs = bgrs[1]
-    rs = bgrs[2]
-    assert len(bs) == len(gs) == len(rs)
-    # print len(bs)
-    bgrs_float_array = array('f')
-    for i in range(0, len(bs)):
-        bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale)  # r
-        bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale)  # g
-        bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale)  # b
-
-    print len(bgrs_float_array)
-
-    print '------------------'
-    print bgrs_float_array[0]
-    print bgrs_float_array[999]
-    return bgrs_float_array
-
-
-bgrs = tools.resize_take_rgbs('newyolo_1.jpg', (416, 416, 3))
-array = combine_bgrs_nhwc(bgrs, (0, 0, 0), 1.0 / 255)
-tools.save_to_file('desktop_1_3_416_416_nhwc_float', array)
-
-cv2.waitKey(0)
diff --git a/mobile/tools/python/imagetools/numpy2binary.py b/mobile/tools/python/imagetools/numpy2binary.py
deleted file mode 100644
index 9d9a7d0c86..0000000000
--- a/mobile/tools/python/imagetools/numpy2binary.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env bash
-# coding=utf-8
-
-# This script convert numpy format to binary's
-import cv2
-import numpy as np
-import imagetools as tools
-from array import array
-
-
-'''
-image = cv2.imread(path)
-print image.shape
-print_rgb(image[0, 0])
-# mage len may be for .just check it
-image.resize(shape_h_w)
-'''
-
-if __name__ == "__main__":
-    # input params
-    reshape_dict = {"n": 1, "c": 3, "h": 224, "w": 224}
-    np_file_path = 'banana_1_3_224_224_nchw_float'
-    save_file_name = 'in_put_1_3_224_224_nchw'
-
-    # load input etc.
-    np = np.fromfile(np_file_path, 'f')
-    #np = cv2.imread(np_file_path)
-    print("np.size:{}".format(np.size))
-    print("np:{}".format(np))
-    np.reshape(reshape_dict['n'],
-               reshape_dict['c'],
-               reshape_dict['h'],
-               reshape_dict['w'])
-    out_array = array('f')
-
-    '''
-    print("--------------------")
-    print("np.size:{}".format(np.size))
-    print("np[0]:{}".format(np[0])
-
-    print("如果是nhw")
-    # rgb rgb rgb rgb rgb
-    print np[224 * 3 * 2 + 3 * 2 + 2]
-    # print np[2]
-
-    print '如果是nchw --------'
-    # rgb rgb rgb rgb rgb
-    print(np[224 * 224 * 2 + 224 * 2 + 2])
-    # print np[2]
-    # 明明是nchw
-    '''
-
-    for i in range(0, np.size):
-        out_array.append(np[i])
-
-    print("len(out_array):{}".format(len(out_array)))
-    print("out_array[224 * 224 * 2 + 224 * 2 + 2]:{}".format(out_array[224 * 224 * 2 + 224 * 2 + 2]))
-
-    # print out_array
-    tools.save_to_file(save_file_name, out_array)
diff --git a/mobile/tools/python/misc/.gitignore b/mobile/tools/python/misc/.gitignore
deleted file mode 100644
index 2414d1177a..0000000000
--- a/mobile/tools/python/misc/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-0
-1
-images
-__pycache__
diff --git a/mobile/tools/python/misc/fluidtools.py b/mobile/tools/python/misc/fluidtools.py
deleted file mode 100644
index 3032fd5490..0000000000
--- a/mobile/tools/python/misc/fluidtools.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# -*- coding: utf-8 -*
-import os
-import sys
-import math
-import struct
-import subprocess
-import numpy as np
-import paddle.fluid as fluid
-
-fast_check = False
-
-exe = fluid.Executor(fluid.CPUPlace())
-exe.run(fluid.default_startup_program())
-
-ops = None
-def check_model(model_path, dump_data_and_model):
-    check_model_impl(model_path, dump_data_and_model, True)
-    return check_model_impl(model_path, dump_data_and_model, False)
-
-def check_model_impl(model_path, dump_data_and_model, need_check):
-    global ops
-    if need_check:
-        prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model", params_filename="params")
-    else:
-        prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model-checked", params_filename="params-checked")
-    ops = prog.current_block().ops
-    vars = prog.current_block().vars
-    
-    # 获取变量形状
-    def get_var_shape(var_name):
-        vars = prog.current_block().vars
-        shape = vars[var_name].desc.shape()
-        for i in range(len(shape)):
-            dim = shape[i]
-            if dim == -1:
-                shape[i] = 1
-        return shape
-    
-    # 获取输入变量形状
-    def get_feed_var_shape(var_name):
-        # 如果想写死输入形状,放开以下语句
-        # return [1, 3, 224, 224]
-        return get_var_shape(var_name)
-
-    # 生成feed的key-value对
-    def gen_feed_kv():
-        feed_kv = {}
-        for feed_name in feeds:
-            feed_shape = get_feed_var_shape(feed_name)
-            data = np.random.random(feed_shape).astype("float32")
-            feed_kv[feed_name] = data
-        return feed_kv
-
-    feed_kv = gen_feed_kv()
-
-    # 运行模型
-    def run_model(feed_kv=None):
-        if feed_kv is None:
-            feed_kv = gen_feed_kv()
-        outputs = exe.run(prog, feed=feed_kv, fetch_list=fetches, return_numpy=False)
-        results = []
-        for output in outputs:
-            results.append(np.array(output))
-        return results
-
-    # 获取var的数据
-    def get_var_data(var_name, feed_kv=None):
-        # 强制var为可持久化
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if not persistable:
-            v.persistable = True
-        # outputs = run_model(feed_kv=feed_kv)
-        output = np.array(fluid.global_scope().find_var(var_name).get_tensor())
-        # 恢复var的可持久化属性
-        v.persistable = persistable
-        return output
-
-    # 强制所有var为可持久化
-    p_names = []
-    for name in vars:
-        name = str(name)
-        v = fluid.framework._get_var(name, prog)
-        if not v.persistable:
-            v.persistable = True
-            p_names.append(name)
-    outputs = run_model(feed_kv=feed_kv)
-    has_found_wrong_shape = False
-    # 修正每个var的形状
-    for name in vars:
-        name = str(name)
-        v = vars[name]
-        if v.persistable:
-            v1 = fluid.global_scope().find_var(name)
-            try:
-                t1 = v1.get_tensor()
-                shape = t1.shape()
-            except:
-                continue
-            if v.desc.shape() != shape:
-                has_found_wrong_shape = True
-            v.desc.set_shape(shape)
-    # 恢复var的可持久化属性
-    for name in p_names:
-        v = fluid.framework._get_var(name, prog)
-        v.persistable = False
-    if need_check and dump_data_and_model:
-        fluid.io.save_inference_model(dirname=model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model-checked", params_filename="params-checked")
-        return
-    var_cache = {}
-    # 获取每层输出的数据
-    def save_all_op_output(feed_kv=None):
-        output_path = "{}/data".format(model_path)
-        if not os.path.exists(output_path):
-            os.mkdir(output_path)
-        ops = prog.current_block().ops
-        fetch_names = []
-        for fetch in fetches:
-            fetch_names.append(fetch.name)
-        feed_names = feeds
-        for i in range(len(ops)):
-            op = ops[i]
-            var_name = None
-            for name in op.output_arg_names:
-                var_name = name
-                if "tmp" in name:
-                    break
-            real_var_name = None
-            if op.type == "fetch":
-                for name in op.input_arg_names:
-                    real_var_name = name
-                    if "tmp" in name:
-                        break
-            else:
-                real_var_name = var_name
-            if fast_check:
-                if var_name not in fetch_names and var_name not in feed_names:
-                    continue
-            try:
-                shape = get_var_shape(var_name)
-                var_cache[var_name] = shape
-            except:
-                pass
-            if not dump_data_and_model:
-                continue
-            try:
-                np_data = get_var_data(real_var_name, feed_kv=feed_kv)
-                index = -1
-                for i in range(len(fetch_names)):
-                    if real_var_name == fetch_names[i]:
-                        index = i
-                        break
-                if index != -1:
-                    np_data = outputs[index]
-                data = np_data.flatten().tolist()
-                file_name = var_name.replace("/", "_")
-                var_path = output_path + "/" + file_name
-                np_data.tofile(var_path)
-                # out_file = open(var_path, "wb")
-                # if var_name in feed_names:
-                #     for item in data:
-                #         out_file.write(struct.pack("d", item))
-                # else:
-                #     for item in data:
-                #         out_file.write(struct.pack("d", item))
-                # out_file.close()
-            except:
-                print("dump {} {} failed".format(op.type, var_name))
-                pass
-    save_all_op_output()
-    return var_cache
-
-if __name__ == "__main__":
-    model_path = "./1/mobilenet"
-    check_model(model_path, True)
diff --git a/mobile/tools/python/misc/ios-test-server.py b/mobile/tools/python/misc/ios-test-server.py
deleted file mode 100644
index fe2be5733e..0000000000
--- a/mobile/tools/python/misc/ios-test-server.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# -*- coding: utf-8 -*
-import os
-import sys
-import math
-import qrcode
-import subprocess
-import numpy as np
-import paddle.fluid as fluid
-from flask import Flask, request, send_from_directory, jsonify, make_response
-
-# sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
-# from fluidtools import run
-from fluidtools import check_model
-
-dump_data_and_model = False
-
-def get_ip_address():
-    handle = os.popen("ifconfig | grep 172 | grep inet | grep netmask | grep broadcast | cut -d \" \" -f2")
-    ip = handle.read()
-    ip = ip.strip()
-    return ip
-
-app = Flask(__name__, static_url_path='')
-
-param_precisions = [1] # 0 for float16, 1 for float32
-
-def process_model(precision, name):
-    model_dir = "./{}/{}".format(precision, name)
-    os.chdir(model_dir)
-    os.chdir("../..")
-    var_info = check_model(model_dir, dump_data_and_model)
-    return var_info
-
-def get_model_info(precision, name):
-    # model_info = {
-    #     "name": name,
-    #     "params_precision": [precision],
-    #     "fusion": [True, False],
-    #     "reuse_texture": [True, False],
-    #     "use_mps": [True, False],
-    #     "test_performance": True,
-    #     "diff_precision": 0.01,
-    #     "vars_dic": {
-    #     }
-    # }
-    model_info = {
-        "name": name,
-        "params_precision": [precision],
-        "fusion": [True],
-        "reuse_texture": [True],
-        "use_mps": [True, False],
-        "test_performance": False,
-        "diff_precision": 0.01,
-        "vars_dic": {
-        }
-    }
-    var_info = process_model(precision, name)
-    model_info["vars_dic"] = var_info
-    return model_info
-
-model_list = []
-def process_models():
-    for precision in param_precisions:
-        model_names = os.listdir("./{}".format(precision))
-        for name in model_names:
-            model_info = get_model_info(precision, name)
-            model_list.append(model_info)
-
-@app.route('/images/')
-def send_image(path):
-    return send_from_directory('images', path)
-
-@app.route('/getFile//model')
-def send_model(name):
-    precision = 1
-    return send_from_directory("{}/{}".format(precision, name), "model-checked")
-
-@app.route('/getFile//params/')
-def send_params(name, precision):
-    return send_from_directory("{}/{}".format(precision, name), "params-checked")
-
-@app.route('/getFile//data/')
-def send_data(name, var):
-    precision = 1
-    return send_from_directory("{}/{}/data".format(precision, name), var)
-
-@app.route('/getTestInfo', methods=['GET'])
-def test_info():
-    info = {"model_list": model_list}
-    return make_response(jsonify(info), 200)
-
-test_result = None
-@app.route('/putTestResult', methods=['POST'])
-def put_test_result():
-    global test_result
-    test_result = request.get_json()
-    success = True
-    for item in test_result["results"]:
-        result = item["isResultEqual"]
-        if not result:
-            success = False
-            break
-    test_result["aaa-success"] = success
-    os.popen("open -a \"/Applications/Google Chrome.app\" \"{}/showTestResult\"".format(host))
-    return make_response(jsonify({"msg": "ok"}), 200)
-
-@app.route('/showTestResult', methods=['GET'])
-def show_test_result():
-    global test_result
-    return make_response(jsonify(test_result), 200)
-
-@app.route('/', methods=['GET'])
-def home():
-    return ""
-
-host = None
-
-if __name__ == "__main__":
-    process_models()
-    host = "http://{}:8080".format(get_ip_address())
-    image = qrcode.make(host)
-    if not os.path.isdir("images"):
-        os.mkdir("images")
-    image.save("images/qrcode.png")
-    os.popen("open -a \"/Applications/Google Chrome.app\" \"{}\"".format(host))
-    app.run(host="0.0.0.0", port=8080)
diff --git a/mobile/tools/python/misc/restore-git.py b/mobile/tools/python/misc/restore-git.py
deleted file mode 100644
index c0613bcb1d..0000000000
--- a/mobile/tools/python/misc/restore-git.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os
-import sys
-import subprocess
-
-username = ""
-email = ""
-home = ""
-desktop = "{}/Desktop".format(home)
-dir_1 = "{}/1".format(desktop)
-dir_2 = "{}/2".format(desktop)
-src_dir = dir_1
-dest_dir = dir_2
-src_mobile_dir = "{}/paddle-mobile".format(src_dir)
-dest_mobile_dir = "{}/paddle-mobile".format(dest_dir)
-
-def clone_repo(dir):
-    os.chdir(dir)
-    os.system("git clone git@github.com:{}/paddle-mobile.git".format(username))
-    os.chdir("{}/paddle-mobile".format(dir))
-    os.system("git remote add upstream git@github.com:PaddlePaddle/paddle-mobile.git")
-    os.system("git config user.name {}".format(username))
-    os.system("git config user.email {}".format(email))
-
-def get_output(command):
-    out = subprocess.Popen(command.split(" "), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    stdout, stderr = out.communicate()
-    return stdout.decode("utf-8").split("\n")
-
-if __name__ == "__main__":
-    # if not os.path.isdir(src_dir):
-    #     print("dir 1 not found")
-    #     sys.exit(-1)
-    
-    if not os.path.isdir(dest_dir):
-        os.mkdir(dest_dir)
-    if not os.path.isdir(dest_mobile_dir):
-        clone_repo(dest_dir)
-    sys.exit()
-    
-    items = []
-    # items = ["metal/.gitignore", "metal/VideoSuperResolution"]
-    os.chdir(src_mobile_dir)
-    for line in get_output("git status --porcelain"):
-        line = line.strip()
-        items.append(line.split(" ")[-1])
-    
-    for item in items:
-        src = item
-        if len(src) <= 0:
-            continue
-        dest = dest_mobile_dir + "/" + item
-        cmd = "cp -R " + src + " " + dest
-        print(cmd)
-        os.system(cmd)
diff --git a/mobile/tools/python/misc/test-fluid-op-feature.py b/mobile/tools/python/misc/test-fluid-op-feature.py
deleted file mode 100644
index 1657fd2477..0000000000
--- a/mobile/tools/python/misc/test-fluid-op-feature.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import numpy as np
-import paddle.fluid as fluid
-
-exe = fluid.Executor(fluid.CPUPlace())
-exe.run(fluid.default_startup_program())
-
-data = np.array([5.0])
-x = fluid.layers.data(name="x", shape=[1], dtype="float32")
-y = fluid.layers.relu6(x, threshold=4.0)
-
-prog = fluid.default_main_program()
-outputs = exe.run(prog, feed={"x": data}, fetch_list=[y])
-print(outputs)
diff --git a/mobile/tools/python/modeltools/.gitignore b/mobile/tools/python/modeltools/.gitignore
deleted file mode 100644
index 4108f5244b..0000000000
--- a/mobile/tools/python/modeltools/.gitignore
+++ /dev/null
@@ -1,109 +0,0 @@
-# Created by .ignore support plugin (hsz.mobi)
-### Python template
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-
-/yolo/datas/
-/mobilenet/datas/
diff --git a/mobile/tools/python/modeltools/core/__init__.py b/mobile/tools/python/modeltools/core/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/mobile/tools/python/modeltools/core/framework.proto b/mobile/tools/python/modeltools/core/framework.proto
deleted file mode 100644
index 07bfef1c2a..0000000000
--- a/mobile/tools/python/modeltools/core/framework.proto
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-option optimize_for = LITE_RUNTIME;
-package paddle_mobile.framework.proto;
-
-enum AttrType {
-  INT = 0;
-  FLOAT = 1;
-  STRING = 2;
-  INTS = 3;
-  FLOATS = 4;
-  STRINGS = 5;
-  BOOLEAN = 6;
-  BOOLEANS = 7;
-  BLOCK = 8;
-  LONG = 9;
-}
-
-// OpDesc describes an instance of a C++ framework::OperatorBase
-// derived class type.
-message OpDesc {
-
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    optional int32 i = 3;
-    optional float f = 4;
-    optional string s = 5;
-    repeated int32 ints = 6;
-    repeated float floats = 7;
-    repeated string strings = 8;
-    optional bool b = 10;
-    repeated bool bools = 11;
-    optional int32 block_idx = 12;
-    optional int64 l = 13;
-  };
-
-  message Var {
-    required string parameter = 1;
-    repeated string arguments = 2;
-  };
-
-  required string type = 3;
-  repeated Var inputs = 1;
-  repeated Var outputs = 2;
-  repeated Attr attrs = 4;
-  optional bool is_target = 5 [ default = false ];
-};
-
-// OpProto describes a C++ framework::OperatorBase derived class.
-message OpProto {
-
-  // VarProto describes the C++ type framework::Variable.
-  message Var {
-    required string name = 1;
-    required string comment = 2;
-
-    optional bool duplicable = 3 [ default = false ];
-    optional bool intermediate = 4 [ default = false ];
-    optional bool dispensable = 5 [ default = false ];
-  }
-
-  // AttrProto describes the C++ type Attribute.
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    required string comment = 3;
-    // If that attribute is generated, it means the Paddle third
-    // language binding has responsibility to fill that
-    // attribute. End-User should not set that attribute.
-    optional bool generated = 4 [ default = false ];
-  }
-
-  required string type = 1;
-  repeated Var inputs = 2;
-  repeated Var outputs = 3;
-  repeated Attr attrs = 4;
-  required string comment = 5;
-}
-
-message VarType {
-  enum Type {
-    // Pod Types
-    BOOL = 0;
-    INT16 = 1;
-    INT32 = 2;
-    INT64 = 3;
-    FP16 = 4;
-    FP32 = 5;
-    FP64 = 6;
-
-    // Other types that may need additional descriptions
-    LOD_TENSOR = 7;
-    SELECTED_ROWS = 8;
-    FEED_MINIBATCH = 9;
-    FETCH_LIST = 10;
-    STEP_SCOPES = 11;
-    LOD_RANK_TABLE = 12;
-    LOD_TENSOR_ARRAY = 13;
-    PLACE_LIST = 14;
-    READER = 15;
-    CHANNEL = 16;
-    // Any runtime decided variable type is raw
-    // raw variables should manage their own allocations
-    // in operators like nccl_op
-    RAW = 17;
-    TUPLE = 18;
-  }
-
-  required Type type = 1;
-
-  message TensorDesc {
-    // Should only be PODType. Is enforced in C++
-    required Type data_type = 1;
-    repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-  }
-  optional TensorDesc selected_rows = 2;
-
-  message LoDTensorDesc {
-    required TensorDesc tensor = 1;
-    optional int32 lod_level = 2 [ default = 0 ];
-  }
-  optional LoDTensorDesc lod_tensor = 3;
-
-  message LoDTensorArrayDesc {
-    required TensorDesc tensor = 1;
-    optional int32 lod_level = 2 [ default = 0 ];
-  }
-  optional LoDTensorArrayDesc tensor_array = 4;
-
-  message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
-  optional ReaderDesc reader = 5;
-
-  message ChannelDesc {
-    required Type data_type = 1;
-    required int64 capacity = 2;
-  }
-  optional ChannelDesc channel = 6;
-
-  message Tuple { repeated Type element_type = 1; }
-  optional Tuple tuple = 7;
-}
-
-message VarDesc {
-  required string name = 1;
-  required VarType type = 2;
-  optional bool persistable = 3 [ default = false ];
-}
-
-message BlockDesc {
-  required int32 idx = 1;
-  required int32 parent_idx = 2;
-  repeated VarDesc vars = 3;
-  repeated OpDesc ops = 4;
-  optional int32 forward_block_idx = 5 [ default = -1 ];
-}
-
-// Please refer to
-// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
-// for more details.
-// TODO(panyx0718): A model can have multiple programs. Need a
-// way to distinguish them. Maybe ID or name?
-message ProgramDesc { repeated BlockDesc blocks = 1; }
diff --git a/mobile/tools/python/modeltools/core/framework_pb2.py b/mobile/tools/python/modeltools/core/framework_pb2.py
deleted file mode 100644
index 3a43deebc9..0000000000
--- a/mobile/tools/python/modeltools/core/framework_pb2.py
+++ /dev/null
@@ -1,1141 +0,0 @@
-# Generated by the protocol buffer compiler.  DO NOT EDIT!
-# source: framework.proto
-
-import sys
-_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
-from google.protobuf.internal import enum_type_wrapper
-from google.protobuf import descriptor as _descriptor
-from google.protobuf import message as _message
-from google.protobuf import reflection as _reflection
-from google.protobuf import symbol_database as _symbol_database
-from google.protobuf import descriptor_pb2
-# @@protoc_insertion_point(imports)
-
-_sym_db = _symbol_database.Default()
-
-
-
-
-DESCRIPTOR = _descriptor.FileDescriptor(
-  name='framework.proto',
-  package='paddle_mobile.framework.proto',
-  syntax='proto2',
-  serialized_pb=_b('\n\x0f\x66ramework.proto\x12\x1dpaddle_mobile.framework.proto\"\xe5\x03\n\x06OpDesc\x12\x0c\n\x04type\x18\x03 \x02(\t\x12\x39\n\x06inputs\x18\x01 \x03(\x0b\x32).paddle_mobile.framework.proto.OpDesc.Var\x12:\n\x07outputs\x18\x02 \x03(\x0b\x32).paddle_mobile.framework.proto.OpDesc.Var\x12\x39\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpDesc.Attr\x12\x18\n\tis_target\x18\x05 \x01(\x08:\x05\x66\x61lse\x1a\xd3\x01\n\x04\x41ttr\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x35\n\x04type\x18\x02 \x02(\x0e\x32\'.paddle_mobile.framework.proto.AttrType\x12\t\n\x01i\x18\x03 \x01(\x05\x12\t\n\x01\x66\x18\x04 \x01(\x02\x12\t\n\x01s\x18\x05 \x01(\t\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0e\n\x06\x66loats\x18\x07 \x03(\x02\x12\x0f\n\x07strings\x18\x08 \x03(\t\x12\t\n\x01\x62\x18\n \x01(\x08\x12\r\n\x05\x62ools\x18\x0b \x03(\x08\x12\x11\n\tblock_idx\x18\x0c \x01(\x05\x12\t\n\x01l\x18\r \x01(\x03\x1a+\n\x03Var\x12\x11\n\tparameter\x18\x01 \x02(\t\x12\x11\n\targuments\x18\x02 \x03(\t\"\xcf\x03\n\x07OpProto\x12\x0c\n\x04type\x18\x01 \x02(\t\x12:\n\x06inputs\x18\x02 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpProto.Var\x12;\n\x07outputs\x18\x03 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpProto.Var\x12:\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32+.paddle_mobile.framework.proto.OpProto.Attr\x12\x0f\n\x07\x63omment\x18\x05 \x02(\t\x1ax\n\x03Var\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0f\n\x07\x63omment\x18\x02 \x02(\t\x12\x19\n\nduplicable\x18\x03 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0cintermediate\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x1a\n\x0b\x64ispensable\x18\x05 \x01(\x08:\x05\x66\x61lse\x1av\n\x04\x41ttr\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x35\n\x04type\x18\x02 \x02(\x0e\x32\'.paddle_mobile.framework.proto.AttrType\x12\x0f\n\x07\x63omment\x18\x03 \x02(\t\x12\x18\n\tgenerated\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xb9\n\n\x07VarType\x12\x39\n\x04type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12H\n\rselected_rows\x18\x02 
\x01(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12H\n\nlod_tensor\x18\x03 \x01(\x0b\x32\x34.paddle_mobile.framework.proto.VarType.LoDTensorDesc\x12O\n\x0ctensor_array\x18\x04 \x01(\x0b\x32\x39.paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc\x12\x41\n\x06reader\x18\x05 \x01(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.ReaderDesc\x12\x43\n\x07\x63hannel\x18\x06 \x01(\x0b\x32\x32.paddle_mobile.framework.proto.VarType.ChannelDesc\x12;\n\x05tuple\x18\x07 \x01(\x0b\x32,.paddle_mobile.framework.proto.VarType.Tuple\x1aZ\n\nTensorDesc\x12>\n\tdata_type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x1ah\n\rLoDTensorDesc\x12\x41\n\x06tensor\x18\x01 \x02(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12\x14\n\tlod_level\x18\x02 \x01(\x05:\x01\x30\x1am\n\x12LoDTensorArrayDesc\x12\x41\n\x06tensor\x18\x01 \x02(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12\x14\n\tlod_level\x18\x02 \x01(\x05:\x01\x30\x1aV\n\nReaderDesc\x12H\n\nlod_tensor\x18\x01 \x03(\x0b\x32\x34.paddle_mobile.framework.proto.VarType.LoDTensorDesc\x1a_\n\x0b\x43hannelDesc\x12>\n\tdata_type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12\x10\n\x08\x63\x61pacity\x18\x02 \x02(\x03\x1aJ\n\x05Tuple\x12\x41\n\x0c\x65lement_type\x18\x01 
\x03(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\"\x8e\x02\n\x04Type\x12\x08\n\x04\x42OOL\x10\x00\x12\t\n\x05INT16\x10\x01\x12\t\n\x05INT32\x10\x02\x12\t\n\x05INT64\x10\x03\x12\x08\n\x04\x46P16\x10\x04\x12\x08\n\x04\x46P32\x10\x05\x12\x08\n\x04\x46P64\x10\x06\x12\x0e\n\nLOD_TENSOR\x10\x07\x12\x11\n\rSELECTED_ROWS\x10\x08\x12\x12\n\x0e\x46\x45\x45\x44_MINIBATCH\x10\t\x12\x0e\n\nFETCH_LIST\x10\n\x12\x0f\n\x0bSTEP_SCOPES\x10\x0b\x12\x12\n\x0eLOD_RANK_TABLE\x10\x0c\x12\x14\n\x10LOD_TENSOR_ARRAY\x10\r\x12\x0e\n\nPLACE_LIST\x10\x0e\x12\n\n\x06READER\x10\x0f\x12\x0b\n\x07\x43HANNEL\x10\x10\x12\x07\n\x03RAW\x10\x11\x12\t\n\x05TUPLE\x10\x12\"i\n\x07VarDesc\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x34\n\x04type\x18\x02 \x02(\x0b\x32&.paddle_mobile.framework.proto.VarType\x12\x1a\n\x0bpersistable\x18\x03 \x01(\x08:\x05\x66\x61lse\"\xb5\x01\n\tBlockDesc\x12\x0b\n\x03idx\x18\x01 \x02(\x05\x12\x12\n\nparent_idx\x18\x02 \x02(\x05\x12\x34\n\x04vars\x18\x03 \x03(\x0b\x32&.paddle_mobile.framework.proto.VarDesc\x12\x32\n\x03ops\x18\x04 \x03(\x0b\x32%.paddle_mobile.framework.proto.OpDesc\x12\x1d\n\x11\x66orward_block_idx\x18\x05 \x01(\x05:\x02-1\"G\n\x0bProgramDesc\x12\x38\n\x06\x62locks\x18\x01 \x03(\x0b\x32(.paddle_mobile.framework.proto.BlockDesc*}\n\x08\x41ttrType\x12\x07\n\x03INT\x10\x00\x12\t\n\x05\x46LOAT\x10\x01\x12\n\n\x06STRING\x10\x02\x12\x08\n\x04INTS\x10\x03\x12\n\n\x06\x46LOATS\x10\x04\x12\x0b\n\x07STRINGS\x10\x05\x12\x0b\n\x07\x42OOLEAN\x10\x06\x12\x0c\n\x08\x42OOLEANS\x10\x07\x12\t\n\x05\x42LOCK\x10\x08\x12\x08\n\x04LONG\x10\tB\x02H\x03')
-)
-_sym_db.RegisterFileDescriptor(DESCRIPTOR)
-
-_ATTRTYPE = _descriptor.EnumDescriptor(
-  name='AttrType',
-  full_name='paddle_mobile.framework.proto.AttrType',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='INT', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='FLOAT', index=1, number=1,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='STRING', index=2, number=2,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='INTS', index=3, number=3,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='FLOATS', index=4, number=4,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='STRINGS', index=5, number=5,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='BOOLEAN', index=6, number=6,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='BOOLEANS', index=7, number=7,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='BLOCK', index=8, number=8,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='LONG', index=9, number=9,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=2708,
-  serialized_end=2833,
-)
-_sym_db.RegisterEnumDescriptor(_ATTRTYPE)
-
-AttrType = enum_type_wrapper.EnumTypeWrapper(_ATTRTYPE)
-INT = 0
-FLOAT = 1
-STRING = 2
-INTS = 3
-FLOATS = 4
-STRINGS = 5
-BOOLEAN = 6
-BOOLEANS = 7
-BLOCK = 8
-LONG = 9
-
-
-_VARTYPE_TYPE = _descriptor.EnumDescriptor(
-  name='Type',
-  full_name='paddle_mobile.framework.proto.VarType.Type',
-  filename=None,
-  file=DESCRIPTOR,
-  values=[
-    _descriptor.EnumValueDescriptor(
-      name='BOOL', index=0, number=0,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='INT16', index=1, number=1,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='INT32', index=2, number=2,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='INT64', index=3, number=3,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='FP16', index=4, number=4,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='FP32', index=5, number=5,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='FP64', index=6, number=6,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='LOD_TENSOR', index=7, number=7,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='SELECTED_ROWS', index=8, number=8,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='FEED_MINIBATCH', index=9, number=9,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='FETCH_LIST', index=10, number=10,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='STEP_SCOPES', index=11, number=11,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='LOD_RANK_TABLE', index=12, number=12,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='LOD_TENSOR_ARRAY', index=13, number=13,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='PLACE_LIST', index=14, number=14,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='READER', index=15, number=15,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='CHANNEL', index=16, number=16,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='RAW', index=17, number=17,
-      options=None,
-      type=None),
-    _descriptor.EnumValueDescriptor(
-      name='TUPLE', index=18, number=18,
-      options=None,
-      type=None),
-  ],
-  containing_type=None,
-  options=None,
-  serialized_start=2072,
-  serialized_end=2342,
-)
-_sym_db.RegisterEnumDescriptor(_VARTYPE_TYPE)
-
-
-_OPDESC_ATTR = _descriptor.Descriptor(
-  name='Attr',
-  full_name='paddle_mobile.framework.proto.OpDesc.Attr',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle_mobile.framework.proto.OpDesc.Attr.name', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle_mobile.framework.proto.OpDesc.Attr.type', index=1,
-      number=2, type=14, cpp_type=8, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='i', full_name='paddle_mobile.framework.proto.OpDesc.Attr.i', index=2,
-      number=3, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='f', full_name='paddle_mobile.framework.proto.OpDesc.Attr.f', index=3,
-      number=4, type=2, cpp_type=6, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='s', full_name='paddle_mobile.framework.proto.OpDesc.Attr.s', index=4,
-      number=5, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ints', full_name='paddle_mobile.framework.proto.OpDesc.Attr.ints', index=5,
-      number=6, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='floats', full_name='paddle_mobile.framework.proto.OpDesc.Attr.floats', index=6,
-      number=7, type=2, cpp_type=6, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='strings', full_name='paddle_mobile.framework.proto.OpDesc.Attr.strings', index=7,
-      number=8, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='b', full_name='paddle_mobile.framework.proto.OpDesc.Attr.b', index=8,
-      number=10, type=8, cpp_type=7, label=1,
-      has_default_value=False, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='bools', full_name='paddle_mobile.framework.proto.OpDesc.Attr.bools', index=9,
-      number=11, type=8, cpp_type=7, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='block_idx', full_name='paddle_mobile.framework.proto.OpDesc.Attr.block_idx', index=10,
-      number=12, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='l', full_name='paddle_mobile.framework.proto.OpDesc.Attr.l', index=11,
-      number=13, type=3, cpp_type=2, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=280,
-  serialized_end=491,
-)
-
-_OPDESC_VAR = _descriptor.Descriptor(
-  name='Var',
-  full_name='paddle_mobile.framework.proto.OpDesc.Var',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='parameter', full_name='paddle_mobile.framework.proto.OpDesc.Var.parameter', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='arguments', full_name='paddle_mobile.framework.proto.OpDesc.Var.arguments', index=1,
-      number=2, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=493,
-  serialized_end=536,
-)
-
-_OPDESC = _descriptor.Descriptor(
-  name='OpDesc',
-  full_name='paddle_mobile.framework.proto.OpDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle_mobile.framework.proto.OpDesc.type', index=0,
-      number=3, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='inputs', full_name='paddle_mobile.framework.proto.OpDesc.inputs', index=1,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='outputs', full_name='paddle_mobile.framework.proto.OpDesc.outputs', index=2,
-      number=2, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='attrs', full_name='paddle_mobile.framework.proto.OpDesc.attrs', index=3,
-      number=4, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='is_target', full_name='paddle_mobile.framework.proto.OpDesc.is_target', index=4,
-      number=5, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[_OPDESC_ATTR, _OPDESC_VAR, ],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=51,
-  serialized_end=536,
-)
-
-
-_OPPROTO_VAR = _descriptor.Descriptor(
-  name='Var',
-  full_name='paddle_mobile.framework.proto.OpProto.Var',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle_mobile.framework.proto.OpProto.Var.name', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='comment', full_name='paddle_mobile.framework.proto.OpProto.Var.comment', index=1,
-      number=2, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='duplicable', full_name='paddle_mobile.framework.proto.OpProto.Var.duplicable', index=2,
-      number=3, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='intermediate', full_name='paddle_mobile.framework.proto.OpProto.Var.intermediate', index=3,
-      number=4, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dispensable', full_name='paddle_mobile.framework.proto.OpProto.Var.dispensable', index=4,
-      number=5, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=762,
-  serialized_end=882,
-)
-
-_OPPROTO_ATTR = _descriptor.Descriptor(
-  name='Attr',
-  full_name='paddle_mobile.framework.proto.OpProto.Attr',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle_mobile.framework.proto.OpProto.Attr.name', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle_mobile.framework.proto.OpProto.Attr.type', index=1,
-      number=2, type=14, cpp_type=8, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='comment', full_name='paddle_mobile.framework.proto.OpProto.Attr.comment', index=2,
-      number=3, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='generated', full_name='paddle_mobile.framework.proto.OpProto.Attr.generated', index=3,
-      number=4, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=884,
-  serialized_end=1002,
-)
-
-_OPPROTO = _descriptor.Descriptor(
-  name='OpProto',
-  full_name='paddle_mobile.framework.proto.OpProto',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle_mobile.framework.proto.OpProto.type', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='inputs', full_name='paddle_mobile.framework.proto.OpProto.inputs', index=1,
-      number=2, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='outputs', full_name='paddle_mobile.framework.proto.OpProto.outputs', index=2,
-      number=3, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='attrs', full_name='paddle_mobile.framework.proto.OpProto.attrs', index=3,
-      number=4, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='comment', full_name='paddle_mobile.framework.proto.OpProto.comment', index=4,
-      number=5, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[_OPPROTO_VAR, _OPPROTO_ATTR, ],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=539,
-  serialized_end=1002,
-)
-
-
-_VARTYPE_TENSORDESC = _descriptor.Descriptor(
-  name='TensorDesc',
-  full_name='paddle_mobile.framework.proto.VarType.TensorDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='data_type', full_name='paddle_mobile.framework.proto.VarType.TensorDesc.data_type', index=0,
-      number=1, type=14, cpp_type=8, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='dims', full_name='paddle_mobile.framework.proto.VarType.TensorDesc.dims', index=1,
-      number=2, type=3, cpp_type=2, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1501,
-  serialized_end=1591,
-)
-
-_VARTYPE_LODTENSORDESC = _descriptor.Descriptor(
-  name='LoDTensorDesc',
-  full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='tensor', full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc.tensor', index=0,
-      number=1, type=11, cpp_type=10, label=2,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='lod_level', full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc.lod_level', index=1,
-      number=2, type=5, cpp_type=1, label=1,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1593,
-  serialized_end=1697,
-)
-
-_VARTYPE_LODTENSORARRAYDESC = _descriptor.Descriptor(
-  name='LoDTensorArrayDesc',
-  full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='tensor', full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc.tensor', index=0,
-      number=1, type=11, cpp_type=10, label=2,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='lod_level', full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc.lod_level', index=1,
-      number=2, type=5, cpp_type=1, label=1,
-      has_default_value=True, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1699,
-  serialized_end=1808,
-)
-
-_VARTYPE_READERDESC = _descriptor.Descriptor(
-  name='ReaderDesc',
-  full_name='paddle_mobile.framework.proto.VarType.ReaderDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='lod_tensor', full_name='paddle_mobile.framework.proto.VarType.ReaderDesc.lod_tensor', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1810,
-  serialized_end=1896,
-)
-
-_VARTYPE_CHANNELDESC = _descriptor.Descriptor(
-  name='ChannelDesc',
-  full_name='paddle_mobile.framework.proto.VarType.ChannelDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='data_type', full_name='paddle_mobile.framework.proto.VarType.ChannelDesc.data_type', index=0,
-      number=1, type=14, cpp_type=8, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='capacity', full_name='paddle_mobile.framework.proto.VarType.ChannelDesc.capacity', index=1,
-      number=2, type=3, cpp_type=2, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1898,
-  serialized_end=1993,
-)
-
-_VARTYPE_TUPLE = _descriptor.Descriptor(
-  name='Tuple',
-  full_name='paddle_mobile.framework.proto.VarType.Tuple',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='element_type', full_name='paddle_mobile.framework.proto.VarType.Tuple.element_type', index=0,
-      number=1, type=14, cpp_type=8, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1995,
-  serialized_end=2069,
-)
-
-_VARTYPE = _descriptor.Descriptor(
-  name='VarType',
-  full_name='paddle_mobile.framework.proto.VarType',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle_mobile.framework.proto.VarType.type', index=0,
-      number=1, type=14, cpp_type=8, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='selected_rows', full_name='paddle_mobile.framework.proto.VarType.selected_rows', index=1,
-      number=2, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='lod_tensor', full_name='paddle_mobile.framework.proto.VarType.lod_tensor', index=2,
-      number=3, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='tensor_array', full_name='paddle_mobile.framework.proto.VarType.tensor_array', index=3,
-      number=4, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='reader', full_name='paddle_mobile.framework.proto.VarType.reader', index=4,
-      number=5, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='channel', full_name='paddle_mobile.framework.proto.VarType.channel', index=5,
-      number=6, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='tuple', full_name='paddle_mobile.framework.proto.VarType.tuple', index=6,
-      number=7, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[_VARTYPE_TENSORDESC, _VARTYPE_LODTENSORDESC, _VARTYPE_LODTENSORARRAYDESC, _VARTYPE_READERDESC, _VARTYPE_CHANNELDESC, _VARTYPE_TUPLE, ],
-  enum_types=[
-    _VARTYPE_TYPE,
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=1005,
-  serialized_end=2342,
-)
-
-
-_VARDESC = _descriptor.Descriptor(
-  name='VarDesc',
-  full_name='paddle_mobile.framework.proto.VarDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='name', full_name='paddle_mobile.framework.proto.VarDesc.name', index=0,
-      number=1, type=9, cpp_type=9, label=2,
-      has_default_value=False, default_value=_b("").decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='type', full_name='paddle_mobile.framework.proto.VarDesc.type', index=1,
-      number=2, type=11, cpp_type=10, label=2,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='persistable', full_name='paddle_mobile.framework.proto.VarDesc.persistable', index=2,
-      number=3, type=8, cpp_type=7, label=1,
-      has_default_value=True, default_value=False,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2344,
-  serialized_end=2449,
-)
-
-
-_BLOCKDESC = _descriptor.Descriptor(
-  name='BlockDesc',
-  full_name='paddle_mobile.framework.proto.BlockDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='idx', full_name='paddle_mobile.framework.proto.BlockDesc.idx', index=0,
-      number=1, type=5, cpp_type=1, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='parent_idx', full_name='paddle_mobile.framework.proto.BlockDesc.parent_idx', index=1,
-      number=2, type=5, cpp_type=1, label=2,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='vars', full_name='paddle_mobile.framework.proto.BlockDesc.vars', index=2,
-      number=3, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='ops', full_name='paddle_mobile.framework.proto.BlockDesc.ops', index=3,
-      number=4, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-    _descriptor.FieldDescriptor(
-      name='forward_block_idx', full_name='paddle_mobile.framework.proto.BlockDesc.forward_block_idx', index=4,
-      number=5, type=5, cpp_type=1, label=1,
-      has_default_value=True, default_value=-1,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2452,
-  serialized_end=2633,
-)
-
-
-_PROGRAMDESC = _descriptor.Descriptor(
-  name='ProgramDesc',
-  full_name='paddle_mobile.framework.proto.ProgramDesc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='blocks', full_name='paddle_mobile.framework.proto.ProgramDesc.blocks', index=0,
-      number=1, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      options=None),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=2635,
-  serialized_end=2706,
-)
-
-_OPDESC_ATTR.fields_by_name['type'].enum_type = _ATTRTYPE
-_OPDESC_ATTR.containing_type = _OPDESC
-_OPDESC_VAR.containing_type = _OPDESC
-_OPDESC.fields_by_name['inputs'].message_type = _OPDESC_VAR
-_OPDESC.fields_by_name['outputs'].message_type = _OPDESC_VAR
-_OPDESC.fields_by_name['attrs'].message_type = _OPDESC_ATTR
-_OPPROTO_VAR.containing_type = _OPPROTO
-_OPPROTO_ATTR.fields_by_name['type'].enum_type = _ATTRTYPE
-_OPPROTO_ATTR.containing_type = _OPPROTO
-_OPPROTO.fields_by_name['inputs'].message_type = _OPPROTO_VAR
-_OPPROTO.fields_by_name['outputs'].message_type = _OPPROTO_VAR
-_OPPROTO.fields_by_name['attrs'].message_type = _OPPROTO_ATTR
-_VARTYPE_TENSORDESC.fields_by_name['data_type'].enum_type = _VARTYPE_TYPE
-_VARTYPE_TENSORDESC.containing_type = _VARTYPE
-_VARTYPE_LODTENSORDESC.fields_by_name['tensor'].message_type = _VARTYPE_TENSORDESC
-_VARTYPE_LODTENSORDESC.containing_type = _VARTYPE
-_VARTYPE_LODTENSORARRAYDESC.fields_by_name['tensor'].message_type = _VARTYPE_TENSORDESC
-_VARTYPE_LODTENSORARRAYDESC.containing_type = _VARTYPE
-_VARTYPE_READERDESC.fields_by_name['lod_tensor'].message_type = _VARTYPE_LODTENSORDESC
-_VARTYPE_READERDESC.containing_type = _VARTYPE
-_VARTYPE_CHANNELDESC.fields_by_name['data_type'].enum_type = _VARTYPE_TYPE
-_VARTYPE_CHANNELDESC.containing_type = _VARTYPE
-_VARTYPE_TUPLE.fields_by_name['element_type'].enum_type = _VARTYPE_TYPE
-_VARTYPE_TUPLE.containing_type = _VARTYPE
-_VARTYPE.fields_by_name['type'].enum_type = _VARTYPE_TYPE
-_VARTYPE.fields_by_name['selected_rows'].message_type = _VARTYPE_TENSORDESC
-_VARTYPE.fields_by_name['lod_tensor'].message_type = _VARTYPE_LODTENSORDESC
-_VARTYPE.fields_by_name['tensor_array'].message_type = _VARTYPE_LODTENSORARRAYDESC
-_VARTYPE.fields_by_name['reader'].message_type = _VARTYPE_READERDESC
-_VARTYPE.fields_by_name['channel'].message_type = _VARTYPE_CHANNELDESC
-_VARTYPE.fields_by_name['tuple'].message_type = _VARTYPE_TUPLE
-_VARTYPE_TYPE.containing_type = _VARTYPE
-_VARDESC.fields_by_name['type'].message_type = _VARTYPE
-_BLOCKDESC.fields_by_name['vars'].message_type = _VARDESC
-_BLOCKDESC.fields_by_name['ops'].message_type = _OPDESC
-_PROGRAMDESC.fields_by_name['blocks'].message_type = _BLOCKDESC
-DESCRIPTOR.message_types_by_name['OpDesc'] = _OPDESC
-DESCRIPTOR.message_types_by_name['OpProto'] = _OPPROTO
-DESCRIPTOR.message_types_by_name['VarType'] = _VARTYPE
-DESCRIPTOR.message_types_by_name['VarDesc'] = _VARDESC
-DESCRIPTOR.message_types_by_name['BlockDesc'] = _BLOCKDESC
-DESCRIPTOR.message_types_by_name['ProgramDesc'] = _PROGRAMDESC
-DESCRIPTOR.enum_types_by_name['AttrType'] = _ATTRTYPE
-
-OpDesc = _reflection.GeneratedProtocolMessageType('OpDesc', (_message.Message,), dict(
-
-  Attr = _reflection.GeneratedProtocolMessageType('Attr', (_message.Message,), dict(
-    DESCRIPTOR = _OPDESC_ATTR,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc.Attr)
-    ))
-  ,
-
-  Var = _reflection.GeneratedProtocolMessageType('Var', (_message.Message,), dict(
-    DESCRIPTOR = _OPDESC_VAR,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc.Var)
-    ))
-  ,
-  DESCRIPTOR = _OPDESC,
-  __module__ = 'framework_pb2'
-  # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc)
-  ))
-_sym_db.RegisterMessage(OpDesc)
-_sym_db.RegisterMessage(OpDesc.Attr)
-_sym_db.RegisterMessage(OpDesc.Var)
-
-OpProto = _reflection.GeneratedProtocolMessageType('OpProto', (_message.Message,), dict(
-
-  Var = _reflection.GeneratedProtocolMessageType('Var', (_message.Message,), dict(
-    DESCRIPTOR = _OPPROTO_VAR,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto.Var)
-    ))
-  ,
-
-  Attr = _reflection.GeneratedProtocolMessageType('Attr', (_message.Message,), dict(
-    DESCRIPTOR = _OPPROTO_ATTR,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto.Attr)
-    ))
-  ,
-  DESCRIPTOR = _OPPROTO,
-  __module__ = 'framework_pb2'
-  # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto)
-  ))
-_sym_db.RegisterMessage(OpProto)
-_sym_db.RegisterMessage(OpProto.Var)
-_sym_db.RegisterMessage(OpProto.Attr)
-
-VarType = _reflection.GeneratedProtocolMessageType('VarType', (_message.Message,), dict(
-
-  TensorDesc = _reflection.GeneratedProtocolMessageType('TensorDesc', (_message.Message,), dict(
-    DESCRIPTOR = _VARTYPE_TENSORDESC,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.TensorDesc)
-    ))
-  ,
-
-  LoDTensorDesc = _reflection.GeneratedProtocolMessageType('LoDTensorDesc', (_message.Message,), dict(
-    DESCRIPTOR = _VARTYPE_LODTENSORDESC,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.LoDTensorDesc)
-    ))
-  ,
-
-  LoDTensorArrayDesc = _reflection.GeneratedProtocolMessageType('LoDTensorArrayDesc', (_message.Message,), dict(
-    DESCRIPTOR = _VARTYPE_LODTENSORARRAYDESC,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc)
-    ))
-  ,
-
-  ReaderDesc = _reflection.GeneratedProtocolMessageType('ReaderDesc', (_message.Message,), dict(
-    DESCRIPTOR = _VARTYPE_READERDESC,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.ReaderDesc)
-    ))
-  ,
-
-  ChannelDesc = _reflection.GeneratedProtocolMessageType('ChannelDesc', (_message.Message,), dict(
-    DESCRIPTOR = _VARTYPE_CHANNELDESC,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.ChannelDesc)
-    ))
-  ,
-
-  Tuple = _reflection.GeneratedProtocolMessageType('Tuple', (_message.Message,), dict(
-    DESCRIPTOR = _VARTYPE_TUPLE,
-    __module__ = 'framework_pb2'
-    # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.Tuple)
-    ))
-  ,
-  DESCRIPTOR = _VARTYPE,
-  __module__ = 'framework_pb2'
-  # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType)
-  ))
-_sym_db.RegisterMessage(VarType)
-_sym_db.RegisterMessage(VarType.TensorDesc)
-_sym_db.RegisterMessage(VarType.LoDTensorDesc)
-_sym_db.RegisterMessage(VarType.LoDTensorArrayDesc)
-_sym_db.RegisterMessage(VarType.ReaderDesc)
-_sym_db.RegisterMessage(VarType.ChannelDesc)
-_sym_db.RegisterMessage(VarType.Tuple)
-
-VarDesc = _reflection.GeneratedProtocolMessageType('VarDesc', (_message.Message,), dict(
-  DESCRIPTOR = _VARDESC,
-  __module__ = 'framework_pb2'
-  # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarDesc)
-  ))
-_sym_db.RegisterMessage(VarDesc)
-
-BlockDesc = _reflection.GeneratedProtocolMessageType('BlockDesc', (_message.Message,), dict(
-  DESCRIPTOR = _BLOCKDESC,
-  __module__ = 'framework_pb2'
-  # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.BlockDesc)
-  ))
-_sym_db.RegisterMessage(BlockDesc)
-
-ProgramDesc = _reflection.GeneratedProtocolMessageType('ProgramDesc', (_message.Message,), dict(
-  DESCRIPTOR = _PROGRAMDESC,
-  __module__ = 'framework_pb2'
-  # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.ProgramDesc)
-  ))
-_sym_db.RegisterMessage(ProgramDesc)
-
-
-DESCRIPTOR.has_options = True
-DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('H\003'))
-# @@protoc_insertion_point(module_scope)
diff --git a/mobile/tools/python/modeltools/core/op_types.py b/mobile/tools/python/modeltools/core/op_types.py
deleted file mode 100644
index 550f87339c..0000000000
--- a/mobile/tools/python/modeltools/core/op_types.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# coding=utf-8
-
-# mdl layers
-layer_mdl_conv = 'ConvolutionLayer'
-layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer'
-layer_mdl_relu = 'ReluLayer'
-layer_mdl_pointwise_add = 'PointwiseConvolutionLayer'
-layer_mdl_pooling = 'PoolingLayer'
-layer_mdl_softmax = 'SoftmaxLayer'
-
-# fluid ops
-op_fluid_fusion_conv_add = 'fusion_conv_add'
-op_fluid_relu = 'relu'
-op_fluid_pooling = 'pool2d'
-op_fluid_softmax = 'softmax'
-
-# dict mdk layer ---  fluid op
-mdl2fluid_op_layer_dict = {
-    layer_mdl_conv: op_fluid_fusion_conv_add,
-    layer_mdl_deepwise_conv: op_fluid_fusion_conv_add,
-    layer_mdl_relu: op_fluid_relu,
-    layer_mdl_pointwise_add: op_fluid_fusion_conv_add,
-    layer_mdl_pooling: op_fluid_pooling,
-    layer_mdl_softmax: op_fluid_softmax
-}
-
-mdl_outputs_key = "outputs"
-mdl_inputs_key = "inputs"
-mdl_weight_key = "weight"
-mdl_attrs_key = "params"
-
-# dict of mdl-input _out param  to fluid input out attrs
-fusion_conv_add_dict = {
-    mdl_inputs_key: 'Input',
-    mdl_outputs_key: 'Out',
-    mdl_weight_key: ('Filter', 'Y'),
-    mdl_attrs_key: (
-        # 'workspace_size_MB', 'use_mkldnn', 'use_cudnn', 'data_format','dilations',
-        # dilations =  [1,1]
-        'groups', 'paddings', 'strides'
-        # 'axis'
-    )
-}
-
-relu_dict = {
-    mdl_inputs_key: 'X',
-    mdl_outputs_key: 'Out',
-    # mdl_weight_key: ()
-
-}
-
-pool2d_dict = {
-    mdl_inputs_key: 'X',
-    mdl_outputs_key: 'Out',
-    # mdl_weight_key: (),
-    mdl_attrs_key: ('pooling_type', 'global_pooling')
-
-}
-
-softmax_dict = {
-    mdl_inputs_key: 'X',
-    mdl_outputs_key: 'Out',
-    mdl_weight_key: (),
-    mdl_attrs_key: ()
-}
-# mdl layers  ---  fluid ops
-op_io_dict = {
-    'fusion_conv_add': fusion_conv_add_dict,
-    'relu': relu_dict,
-    'pool2d': pool2d_dict,
-    'softmax': softmax_dict
-}
-
-# fluid attr key  ---  mdl params key
-fusion_conv_add_attrs_dict = {
-    'paddings': 'pad',
-    'strides': 'stride',
-    'groups': 'group'
-}
-
-# fluid attr key  ---  mdl params key
-pool2d_attrs_dict = {
-    'global_pooling': 'global_pooling',
-    'pooling_type': 'type'
-}
-
-
-# fluid attr key  ---  mdl params key
-fluid_attrs_type_dict = {
-    'paddings': 0,
-    'strides': 6,
-    'groups': 6
-}
diff --git a/mobile/tools/python/modeltools/mobilenet/__init__.py b/mobile/tools/python/modeltools/mobilenet/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/mobile/tools/python/modeltools/mobilenet/converter_mobilenet.py b/mobile/tools/python/modeltools/mobilenet/converter_mobilenet.py
deleted file mode 100644
index ca1e1f7f4d..0000000000
--- a/mobile/tools/python/modeltools/mobilenet/converter_mobilenet.py
+++ /dev/null
@@ -1,509 +0,0 @@
-# coding=utf-8
-import json
-import os
-
-from core import framework_pb2 as framework_pb2, op_types as types
-from mobilenet.swicher import Swichter
-import shutil
-
-
-def load_mdl(mdl_json_path):
-    # print('mdl json path : ' + mdl_json_path)
-    with open(mdl_json_path, 'r') as f:
-        return json.load(f)
-
-
-def create_if_not_exit(target_dir):
-    if os.path.exists(target_dir):
-        shutil.rmtree(target_dir)
-    os.makedirs(target_dir, 0777)
-
-
-class Converter:
-    'convert mdlmodel to fluidmodel'
-
-    def __init__(self, base_dir, mdl_json_path):
-        print 'base_dir:  ' + base_dir
-        self.mdl_json_path = base_dir + mdl_json_path
-        self.base_dir = base_dir
-        print mdl_json_path
-        self.source_weights_dir = self.base_dir + 'datas/sourcemodels/source_weights/'
-        self.target_weight_dir = self.base_dir + 'datas/target/target_weights/'
-
-        create_if_not_exit(self.target_weight_dir)
-
-        self.mdl_json = load_mdl(self.mdl_json_path)
-        self.program_desc = framework_pb2.ProgramDesc()
-        self.weight_list_ = []
-        self.deepwise_weight_list_ = []
-        # print(json_dick)
-        # layers = (json_dick['layer'])
-        # for layer in layers:
-        #     print(layer)
-
-    def convert(self):
-        print 'convert begin.....'
-        # add block_desc
-        block_desc = self.program_desc.blocks.add()
-        block_desc.idx = 0
-        block_desc.parent_idx = -1
-        self.package_ops(block_desc)
-        self.package_vars(block_desc)
-        print 'blocks: '
-        print self.program_desc.blocks
-        print 'convert end.....'
-        desc_serialize_to_string = self.program_desc.SerializeToString()
-
-        outputmodel_dir = self.base_dir + 'datas/target/mobilenet_classfication/'
-        if os.path.exists(outputmodel_dir):
-            shutil.rmtree(outputmodel_dir)
-        os.makedirs(outputmodel_dir, 0777)
-
-        if os.path.exists(outputmodel_dir):
-            shutil.rmtree(outputmodel_dir)
-        # create_if_not_exit(outputmodel_dir)
-
-        shutil.copytree(self.target_weight_dir, outputmodel_dir)
-
-        f = open(outputmodel_dir + "__model__", "wb")
-        f.write(desc_serialize_to_string)
-        f.close()
-
-    def package_ops(self, block_desc):
-
-        self.add_op_feed(block_desc)
-
-        # add ops with layer
-        if 'layer' in self.mdl_json:
-
-            layers_ = self.mdl_json['layer']
-            for layer in layers_:
-
-                if layer['type'] == 'SoftmaxLayer':
-                    pass
-                else:
-                    desc_ops_add = block_desc.ops.add()
-
-                    # print layer
-                    # for i in layer:
-                    #     print i
-                    if 'name' in layer:
-                        l_name = layer['name']
-                    if 'type' in layer:
-                        self.package_ops_type(desc_ops_add, layer)
-
-                    if 'weight' in layer:
-                        self.package_ops_weight2inputs(desc_ops_add, layer)
-
-                    if 'output' in layer:
-                        self.package_ops_outputs(desc_ops_add, layer)
-
-                    if 'input' in layer:
-                        self.package_ops_inputs(desc_ops_add, layer)
-
-                    self.package_ops_attrs(desc_ops_add, layer)
-
-        self.add_op_fetch(block_desc)
-
-    def add_op_feed(self, block_desc):
-        desc_ops_add = block_desc.ops.add()
-        inputs_add = desc_ops_add.inputs.add()
-        inputs_add.parameter = 'X'
-        inputs_add.arguments.append('feed')
-        desc_ops_add.type = 'feed'
-        outputs_add = desc_ops_add.outputs.add()
-        outputs_add.parameter = 'Out'
-        outputs_add.arguments.append('data')
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'col'
-        # boolean
-        attrs_add.type = 0
-        attrs_add.i = 0
-
-    def add_op_fetch(self, block_desc):
-        desc_ops_add = block_desc.ops.add()
-        inputs_add = desc_ops_add.inputs.add()
-        inputs_add.parameter = 'X'
-        # todo pick last layer --> op output
-        inputs_add.arguments.append('fc7')
-        desc_ops_add.type = 'fetch'
-        outputs_add = desc_ops_add.outputs.add()
-        outputs_add.parameter = 'Out'
-        outputs_add.arguments.append('fetch')
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'col'
-        # boolean
-        attrs_add.type = 0
-        attrs_add.i = 0
-
-    @staticmethod
-    def package_ops_attrs(desc_ops_add, layer):
-        # print l_params
-        # print desc_ops_add.type
-        if desc_ops_add.type == types.op_fluid_fusion_conv_add:
-            Converter.pack_fusion_conv_add_attr(desc_ops_add, layer)
-        elif desc_ops_add.type == types.op_fluid_relu:
-            # fusion_conv_add : attrs
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'use_mkldnn'
-            # boolean
-            attrs_add.type = 6
-            attrs_add.b = 0
-        elif desc_ops_add.type == types.op_fluid_pooling:
-            Converter.pack_pooling_attr(desc_ops_add, layer)
-            pass
-        elif desc_ops_add.type == types.op_fluid_softmax:
-            pass
-
-    @staticmethod
-    def pack_pooling_attr(desc_ops_add, layer):
-        print layer
-        l_params = layer['param']
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'use_mkldnn'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = 0
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'use_cudnn'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = 1
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'paddings'
-        # ints
-        attrs_add.type = 3
-        attrs_add.ints.append(0)
-        attrs_add.ints.append(0)
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'strides'
-        # ints
-        attrs_add.type = 3
-        attrs_add.ints.append(1)
-        attrs_add.ints.append(1)
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'global_pooling'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = (l_params[types.pool2d_attrs_dict.get('global_pooling')])
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'pooling_type'
-        # 2-->STRING
-        attrs_add.type = 2
-        # 注意这里 avg but mdl is ave
-        attrs_add.s = l_params[types.pool2d_attrs_dict.get('pooling_type')]
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'ceil_mode'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = 1
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'ksize'
-        # ints
-        attrs_add.type = 3
-        attrs_add.ints.append(7)
-        attrs_add.ints.append(7)
-
-    # type: "pool2d"
-    # attrs
-    # {
-    #     name: "use_mkldnn"
-    #     type: BOOLEAN
-    #     b: false
-    # }
-    # attrs
-    # {
-    #     name: "ceil_mode"
-    #     type: BOOLEAN
-    #     b: true
-    # }
-    # attrs
-    # {
-    #     name: "use_cudnn"
-    #     type: BOOLEAN
-    #     b: true
-    # }
-    # attrs
-    # {
-    #     name: "paddings"
-    #     type: INTS
-    #     ints: 0
-    #     ints: 0
-    # }
-    # attrs
-    # {
-    #     name: "strides"
-    #     type: INTS
-    #     ints: 1
-    #     ints: 1
-    # }
-    # attrs
-    # {
-    #     name: "global_pooling"
-    #     type: BOOLEAN
-    #     b: false
-    # }
-    # attrs
-    # {
-    #     name: "data_format"
-    #     type: STRING
-    #     s: "AnyLayout"
-    # }
-    # attrs
-    # {
-    #     name: "ksize"
-    #     type: INTS
-    #     ints: 7
-    #     ints: 7
-    # }
-    # attrs
-    # {
-    #     name: "pooling_type"
-    #     type: STRING
-    #     s: "avg"
-    # }
-    # is_target: false
-
-    @staticmethod
-    def pack_fusion_conv_add_attr(desc_ops_add, layer):
-
-        # fusion_conv_add : attrs
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'workspace_size_MB'
-        # 0-->INT
-        attrs_add.type = 0
-        attrs_add.i = 4096
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'data_format'
-        # 2-->STRING
-        attrs_add.type = 2
-        attrs_add.s = 'AnyLayout'
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'use_mkldnn'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = 0
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'use_cudnn'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = 1
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'dilations'
-        # ints
-        attrs_add.type = 3
-        attrs_add.ints.append(1)
-        attrs_add.ints.append(1)
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'axis'
-        # int
-        attrs_add.type = 0
-        attrs_add.i = 1
-
-        if 'param' in layer:
-            l_params = layer['param']
-
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'paddings'
-            # ints
-            attrs_add.type = 3
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
-
-            # attrs_add = desc_ops_add.attrs.add()
-            # attrs_add.name = 'paddings'
-            # # ints
-            # attrs_add.type = 3
-            # attrs_add.ints.append(0)
-            # attrs_add.ints.append(0)
-
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'strides'
-            # ints
-            attrs_add.type = 3
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
-
-            # attrs_add = desc_ops_add.attrs.add()
-            # attrs_add.name = 'strides'
-            # # ints
-            # attrs_add.type = 3
-            # attrs_add.ints.append(6)
-            # attrs_add.ints.append(6)
-
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'groups'
-            # int
-            attrs_add.type = 0
-            attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')]
-            # attrs_add.i = 1
-
-        #
-        # op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \
-        #     .get(types.mdl_attrs_key)
-        #
-        #
-        #
-        #
-        # # group stride padding
-        # print '----------------------'
-        # for i, val in enumerate(op_attrs_tupl):
-        #     attrs_add = desc_ops_add.attrs.add()
-        #     attr_name = op_attrs_tupl[i]
-        #     print attr_name
-        #     attrs_add.name = attr_name
-        #     attrs_add.type = types.fluid_attrs_type_dict.get(attr_name)
-        #     attrs_add.
-        #     print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)]
-
-        # for p in l_params:
-        #     attrs_add = desc_ops_add.attrs.add()
-
-    @staticmethod
-    def package_ops_inputs(desc_ops_add, layer):
-        l_inputs = layer['input']
-        for i in l_inputs:
-            inputs_add = desc_ops_add.inputs.add()
-            # print i
-            inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key)
-            inputs_add.arguments.append(i)
-
-    @staticmethod
-    def package_ops_outputs(desc_ops_add, layer):
-        l_outputs = layer['output']
-        for o in l_outputs:
-            # print o
-            outputs_add = desc_ops_add.outputs.add()
-            dict = types.op_io_dict.get(desc_ops_add.type)
-            # print 'desc_ops_add.type:  ' + desc_ops_add.type
-            # print dict
-            outputs_add.parameter = dict.get(types.mdl_outputs_key)
-            outputs_add.arguments.append(o)
-
-    def package_ops_weight2inputs(self, desc_ops_add, layer):
-        l_weights = layer['weight']
-        for w in l_weights:
-            self.weight_list_.append(w)
-
-        if layer['type'] == types.layer_mdl_deepwise_conv:
-            # print l_weights[0]
-            self.deepwise_weight_list_.append(l_weights[0])
-
-        op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key)
-        if op_weight_tup is not None:
-            # print len(op_weight_tup)
-            for i, val in enumerate(op_weight_tup):
-                # print i
-                # print val
-                inputs_add = desc_ops_add.inputs.add()
-                inputs_add.parameter = op_weight_tup[i]
-                inputs_add.arguments.append(l_weights[i])
-
-        # for w in l_weights:
-        #     inputs_add = desc_ops_add.inputs.add()
-        #     # print w
-        #     inputs_add.parameter = op_weight_tup[0]
-        #     inputs_add.arguments.append(w)
-
-    @staticmethod
-    def package_ops_type(desc_ops_add, layer):
-        l_type = layer['type']
-        # print l_type
-        # print mdl2fluid_op_layer_dict.get(l_type)
-        desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type)
-
-    def package_vars(self, block_desc):
-        vars_add = block_desc.vars.add()
-        vars_add.name = 'feed'
-        vars_add.type.type = 9  # 9 is FEED_MINIBATCH
-        vars_add.persistable = 1
-        # fetch
-        vars_add = block_desc.vars.add()
-        vars_add.name = 'fetch'
-        vars_add.type.type = 10  # 10 is fetch list
-        vars_add.persistable = 1
-
-        json_matrix_ = self.mdl_json['matrix']
-        # print json_matrix_
-        for j in json_matrix_:
-            vars_add = block_desc.vars.add()
-            vars_add.name = j
-            vars_add.type.type = 7  # 7 is lodtensor
-            # print j
-            tensor = vars_add.type.lod_tensor.tensor
-            tensor.data_type = 5  # 5 is FP32
-
-            # print json_matrix_
-
-            dims_of_matrix = json_matrix_.get(j)
-            # dims_size = len(dims_of_matrix)
-            # print dims_size
-
-            # if dims_size == 4:
-            #     tensor.dims.append(dims_of_matrix[0])  # N
-            #     tensor.dims.append(dims_of_matrix[3])  # C
-            #     tensor.dims.append(dims_of_matrix[1])  # H
-            #     tensor.dims.append(dims_of_matrix[2])  # W
-            # else:
-
-            # issues in mdl model filter swich n and c
-            if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4:
-                print "deep wise issue fit:  " + j
-                tensor.dims.append(dims_of_matrix[1])
-                tensor.dims.append(dims_of_matrix[0])
-                tensor.dims.append(dims_of_matrix[2])
-                tensor.dims.append(dims_of_matrix[3])
-                print tensor.dims
-            else:
-                for dims in dims_of_matrix:
-                    # print dims
-                    tensor.dims.append(dims)
-
-            if j in self.weight_list_:
-                vars_add.persistable = 1
-                dims_size = len(dims_of_matrix)
-                # print dims_size
-                # print 'weight name : ' + j
-                Swichter().copy_add_head(
-                    self.source_weights_dir + j + '.bin',
-                    self.target_weight_dir + j
-                )
-
-                # if dims_size == 4:
-                #     # convert weight from nhwc to nchw
-                #     Swichter().nhwc2nchw_one_slice_add_head(
-                #         'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
-                #         'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
-                #         'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp',
-                #         dims_of_matrix[0],
-                #         dims_of_matrix[1],
-                #         dims_of_matrix[2],
-                #         dims_of_matrix[3]
-                #     )
-                # else:
-                #     Swichter().copy_add_head(
-                #         'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
-                #         'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
-                #         'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp'
-                #     )
-            else:
-                vars_add.persistable = 0
-
-
-mdl_path = "datas/sourcemodels/source_profile/mobileNetModel.json"
-base_dir = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/"
-converter = Converter(base_dir, mdl_path)
-converter.convert()
diff --git a/mobile/tools/python/modeltools/mobilenet/swicher.py b/mobile/tools/python/modeltools/mobilenet/swicher.py
deleted file mode 100644
index 90bc6d26f6..0000000000
--- a/mobile/tools/python/modeltools/mobilenet/swicher.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import os
-import shutil
-from array import array
-
-
-class Swichter:
-    def __init__(self):
-        pass
-
-    def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width):
-        from_file = open(from_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-
-        float_array = array("f")
-        float_array.fromfile(from_file, width * height * batch * channel)
-        float_write_array = array("f")
-
-        for b in range(batch):
-            for c in range(channel):
-                for h in range(height):
-                    for w in range(width):
-                        float_value = float_array[b * channel * width * height
-                                                  + channel * (h * width + w) + c]
-
-                        float_write_array.append(float_value)
-
-        float_write_array.tofile(to_file)
-        from_file.close()
-        to_file.close()
-
-    def copy(self, from_file_name, to_file_name):
-        from_file = open(from_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-
-        to_file.write(from_file.read())
-        from_file.close()
-        to_file.close()
-
-    def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width):
-        from_file = open(from_file_name, "rb")
-        tmp_file = open(tmp_file_name, "wb+")
-        float_array = array("f")
-        float_array.fromfile(from_file, width * height * batch * channel)
-        float_write_array = array("f")
-
-        for b in range(batch):
-            for c in range(channel):
-                for h in range(height):
-                    for w in range(width):
-                        float_value = float_array[b * channel * width * height
-                                                  + channel * (h * width + w) + c]
-
-                        float_write_array.append(float_value)
-
-        float_write_array.tofile(tmp_file)
-        tmp_file.close()
-        from_file.close()
-
-        tmp_file = open(tmp_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-
-        tmp = tmp_file.read()
-        head = self.read_head('yolo/datas/yolo/head')
-        to_file.write(head)
-        to_file.write(tmp)
-        tmp_file.close()
-        to_file.close()
-
-    def read_head(self, head_file):
-        from_file = open(head_file, "rb")
-        read = from_file.read(24)
-        # print read
-        from_file.close()
-        # print read
-        return read
-
-    def copy_add_head(self, from_file_name, to_file_name):
-
-        from_file = open(from_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-        # tmp_file = open(tmp_file_name, "wb")
-
-        head = self.read_head(
-            '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/head/head')
-        to_file.write(head)
-        to_file.write(from_file.read())
-        from_file.close()
-        to_file.close()
-        pass
-
-    def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding):
-        print'padding  = %d' % padding
-        from_file = open(from_file_name, "rb")
-        # print len(from_file.read())
-        from_file.seek(padding, 0)
-
-        read = from_file.read()
-        print len(read)
-
-        to_file = open(to_file_name, "wb")
-        # tmp_file = open(tmp_file_name, "wb")
-
-        head = self.read_head('yolo/datas/yolo/head')
-        to_file.write(head)
-        to_file.write(read)
-        from_file.close()
-        to_file.close()
-        pass
-
-# Swichter().nhwc2nchw_one_slice_add_head(
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin',
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0',
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp',
-#     32,
-#     3, 3, 3)
-
-# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/head')
-
-# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
diff --git a/mobile/tools/python/modeltools/tools/__init__.py b/mobile/tools/python/modeltools/tools/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/mobile/tools/python/modeltools/tools/float2halffloat.py b/mobile/tools/python/modeltools/tools/float2halffloat.py
deleted file mode 100644
index 3df8d43f95..0000000000
--- a/mobile/tools/python/modeltools/tools/float2halffloat.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# encoding:utf-8
-import math
-import re
-
-
-def Real2HalfFloat(data):
-    MINNUM = -65536
-    MAXNUM = 65535
-    FloatVal = 0
-    if data:
-        if data < MINNUM:
-            data = MINNUM
-        if data > MAXNUM:
-            data = MAXNUM
-
-        sign = 0
-        if data < 0:
-            sign = 1
-            data = -data
-
-        exp = math.floor((math.log2(data)))
-        expout = exp + 16
-
-        Mantial = round(data / pow(2, exp - 10)) - 1024
-
-        if expout <= 0:
-            FloatVal = 0
-        else:
-            FloatVal = sign * 32768 + expout * 1024 + Mantial
-    return FloatVal
-
-
-def ReadCfloatData(sourcefile):
-    input = []
-    with open(sourcfile, 'r') as f:
-        for line in f.readlines():
-            line = line.strip()
-            line = re.sub('\s+', ' ', line)  # 两个数字间多个空格
-            input.append(line.split(' '))
-    destfile = sourcefile.replace('.dat', '')
-    destfile = destfile.replace('.txt', '')
-    destfile += 'Out.dat'
-    with open(destfile, 'w') as fw:
-        for i in range(len(input)):
-            if len(input[i]) == 2:
-                real = Real2HalfFloat(float(input[i][0]))
-                imag = Real2HalfFloat(float(input[i][1]))
-                result = real * 65536 + imag
-                if imag and not real:
-                    fw.write('0x0000' + "%X" % result + '\n')
-                elif not imag and not real:
-                    fw.write('0x00000000' + '\n')
-                else:
-                    fw.write('0x' + "%X" % result + '\n')
-            elif len(input[i]) == 1:
-                result = Real2HalfFloat(float(input[i][0]))
-                if result:
-                    fw.write('0x' + "%X" % result + '\n')
-                else:
-                    fw.write('0x0000' + '\n')
-
-
-if __name__ == '__main__':
-    print('Tips: Input number 0 if you want to exit!\n')
-    while True:
-        sourcfile = input("input source file:\n")
-        if sourcfile is '0':
-            break
-        ReadCfloatData(sourcfile)
-        print('Transfer Success!')
diff --git a/mobile/tools/python/modeltools/tools/loader.py b/mobile/tools/python/modeltools/tools/loader.py
deleted file mode 100644
index 55d9cdde20..0000000000
--- a/mobile/tools/python/modeltools/tools/loader.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import json
-
-
-def loadmdl(json_path):
-    print('mdl json path : ' + json_path)
-    with open(json_path, 'r') as f:
-        json_dick = json.load(f)
-        # print(json_dick)
-        layers = (json_dick['layer'])
-        for layer in layers:
-            print(layer)
diff --git a/mobile/tools/python/modeltools/tools/model_combine.py b/mobile/tools/python/modeltools/tools/model_combine.py
deleted file mode 100644
index 1fe8e6a9cd..0000000000
--- a/mobile/tools/python/modeltools/tools/model_combine.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# coding=utf-8
-import os
-
-path = "mobilenet/"  # 文件夹目录
-to_file_path = "mobilenet_combine/params"
-files = os.listdir(path)  # 得到文件夹下的所有文件名称
-files.sort(cmp=None, key=str.lower)
-to_file = open(to_file_path, "wb")
-
-for file in files:  # 遍历文件夹
-    if not os.path.isdir(file) and file != ".DS_Store":  # 判断是否是文件夹,不是文件夹才打开
-        f = open(path + "/" + file)  # 打开文件
-        name = f.name
-        print 'name:  ' + name
-        from_file = open(name, "rb")
-        to_file.write(from_file.read())
-        from_file.close()
-
-to_file.close()
diff --git a/mobile/tools/python/modeltools/tools/model_reader.py b/mobile/tools/python/modeltools/tools/model_reader.py
deleted file mode 100644
index 5f6e5f0cb9..0000000000
--- a/mobile/tools/python/modeltools/tools/model_reader.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import os
-
-from core import framework_pb2 as framework_pb2
-
-
-def read_model(model_path):
-    print('read_model.')
-    path_8 = unicode(model_path, 'utf8')
-
-    try:
-        with open(path_8, "rb") as f_model:
-            print get_file_size(model_path)
-            desc = framework_pb2.ProgramDesc()
-            desc.ParseFromString(f_model.read())
-            print desc
-            # print desc.blocks
-
-    except IOError:
-        print ": File not found."
-
-
-def get_file_size(file_path):
-    file_path = unicode(file_path, 'utf8')
-    fsize = os.path.getsize(file_path)
-    fsize = fsize / float(1024 * 1024)
-    return round(fsize, 2)
-
-
-path = '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/mobilenet_example/mobilenet/__model__'
-read_model(path)
diff --git a/mobile/tools/python/modeltools/yolo/__init__.py b/mobile/tools/python/modeltools/yolo/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/mobile/tools/python/modeltools/yolo/mdl2fluid.py b/mobile/tools/python/modeltools/yolo/mdl2fluid.py
deleted file mode 100644
index 2c2d0f3e94..0000000000
--- a/mobile/tools/python/modeltools/yolo/mdl2fluid.py
+++ /dev/null
@@ -1,333 +0,0 @@
-import json
-
-from core import framework_pb2 as framework_pb2, op_types as types
-from yolo.swicher import Swichter
-import shutil
-
-
-def load_mdl(mdl_json_path):
-    # print('mdl json path : ' + mdl_json_path)
-    with open(mdl_json_path, 'r') as f:
-        return json.load(f)
-
-
-class Converter:
-    'convert mdlmodel to fluidmodel'
-
-    def __init__(self, mdl_json_path):
-        self.mdl_json_path = mdl_json_path
-        print mdl_json_path
-        self.mdl_json = load_mdl(self.mdl_json_path)
-        self.program_desc = framework_pb2.ProgramDesc()
-        self.weight_list_ = []
-        self.deepwise_weight_list_ = []
-        # print(json_dick)
-        # layers = (json_dick['layer'])
-        # for layer in layers:
-        #     print(layer)
-
-    def convert(self):
-        print 'convert begin.....'
-        # add block_desc
-        block_desc = self.program_desc.blocks.add()
-        block_desc.idx = 0
-        block_desc.parent_idx = -1
-        self.package_ops(block_desc)
-        self.package_vars(block_desc)
-        print 'blocks: '
-        print self.program_desc.blocks
-        print 'convert end.....'
-        desc_serialize_to_string = self.program_desc.SerializeToString()
-        shutil.rmtree('yolo/datas/newyolo/')
-        shutil.copytree('yolo/datas/multiobjects/float32s_nchw_with_head/', 'yolo/datas/newyolo/')
-
-        f = open("yolo/datas/newyolo/__model__", "wb")
-        f.write(desc_serialize_to_string)
-        f.close()
-
-    def package_ops(self, block_desc):
-
-        self.add_op_feed(block_desc)
-
-        # add ops with layer
-        if 'layer' in self.mdl_json:
-
-            layers_ = self.mdl_json['layer']
-            for layer in layers_:
-                desc_ops_add = block_desc.ops.add()
-
-                # print layer
-                # for i in layer:
-                #     print i
-                if 'name' in layer:
-                    l_name = layer['name']
-                if 'type' in layer:
-                    self.package_ops_type(desc_ops_add, layer)
-
-                if 'weight' in layer:
-                    self.package_ops_weight2inputs(desc_ops_add, layer)
-
-                if 'output' in layer:
-                    self.package_ops_outputs(desc_ops_add, layer)
-
-                if 'input' in layer:
-                    self.package_ops_inputs(desc_ops_add, layer)
-
-                self.package_ops_attrs(desc_ops_add, layer)
-
-        self.add_op_fetch(block_desc)
-
-    def add_op_feed(self, block_desc):
-        desc_ops_add = block_desc.ops.add()
-        inputs_add = desc_ops_add.inputs.add()
-        inputs_add.parameter = 'X'
-        inputs_add.arguments.append('feed')
-        desc_ops_add.type = 'feed'
-        outputs_add = desc_ops_add.outputs.add()
-        outputs_add.parameter = 'Out'
-        outputs_add.arguments.append('data')
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'col'
-        # boolean
-        attrs_add.type = 0
-        attrs_add.i = 0
-
-    def add_op_fetch(self, block_desc):
-        desc_ops_add = block_desc.ops.add()
-        inputs_add = desc_ops_add.inputs.add()
-        inputs_add.parameter = 'X'
-        inputs_add.arguments.append('conv_pred_87')
-        desc_ops_add.type = 'fetch'
-        outputs_add = desc_ops_add.outputs.add()
-        outputs_add.parameter = 'Out'
-        outputs_add.arguments.append('fetch')
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'col'
-        # boolean
-        attrs_add.type = 0
-        attrs_add.i = 0
-
-    @staticmethod
-    def package_ops_attrs(desc_ops_add, layer):
-        # print l_params
-        # print desc_ops_add.type
-        if desc_ops_add.type == types.op_fluid_fusion_conv_add:
-            Converter.pack_fusion_conv_add_attr(desc_ops_add, layer)
-        elif desc_ops_add.type == types.op_fluid_relu:
-            # fusion_conv_add : attrs
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'use_mkldnn'
-            # boolean
-            attrs_add.type = 6
-            attrs_add.b = 0
-
-    @staticmethod
-    def pack_fusion_conv_add_attr(desc_ops_add, layer):
-
-        # fusion_conv_add : attrs
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'workspace_size_MB'
-        # 0-->INT
-        attrs_add.type = 0
-        attrs_add.i = 4096
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'data_format'
-        # 2-->STRING
-        attrs_add.type = 2
-        attrs_add.s = 'AnyLayout'
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'use_mkldnn'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = 0
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'use_cudnn'
-        # boolean
-        attrs_add.type = 6
-        attrs_add.b = 1
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'dilations'
-        # ints
-        attrs_add.type = 3
-        attrs_add.ints.append(1)
-        attrs_add.ints.append(1)
-
-        attrs_add = desc_ops_add.attrs.add()
-        attrs_add.name = 'axis'
-        # int
-        attrs_add.type = 0
-        attrs_add.i = 1
-
-        if 'param' in layer:
-            l_params = layer['param']
-
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'paddings'
-            # ints
-            attrs_add.type = 3
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
-
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'strides'
-            # ints
-            attrs_add.type = 3
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
-            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
-
-            attrs_add = desc_ops_add.attrs.add()
-            attrs_add.name = 'groups'
-            # int
-            attrs_add.type = 0
-            attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')]
-            # attrs_add.i = 1
-
-        #
-        # op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \
-        #     .get(types.mdl_attrs_key)
-        #
-        #
-        #
-        #
-        # # group stride padding
-        # print '----------------------'
-        # for i, val in enumerate(op_attrs_tupl):
-        #     attrs_add = desc_ops_add.attrs.add()
-        #     attr_name = op_attrs_tupl[i]
-        #     print attr_name
-        #     attrs_add.name = attr_name
-        #     attrs_add.type = types.fluid_attrs_type_dict.get(attr_name)
-        #     attrs_add.
-        #     print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)]
-
-        # for p in l_params:
-        #     attrs_add = desc_ops_add.attrs.add()
-
-    @staticmethod
-    def package_ops_inputs(desc_ops_add, layer):
-        l_inputs = layer['input']
-        for i in l_inputs:
-            inputs_add = desc_ops_add.inputs.add()
-            # print i
-            inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key)
-            inputs_add.arguments.append(i)
-
-    @staticmethod
-    def package_ops_outputs(desc_ops_add, layer):
-        l_outputs = layer['output']
-        for o in l_outputs:
-            # print o
-            outputs_add = desc_ops_add.outputs.add()
-            outputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_outputs_key)
-            outputs_add.arguments.append(o)
-
-    def package_ops_weight2inputs(self, desc_ops_add, layer):
-        l_weights = layer['weight']
-        for w in l_weights:
-            self.weight_list_.append(w)
-
-        if layer['type'] == 'DepthwiseConvolutionLayer':
-            # print l_weights[0]
-            self.deepwise_weight_list_.append(l_weights[0])
-
-        op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key)
-        # print len(op_weight_tup)
-        for i, val in enumerate(op_weight_tup):
-            # print i
-            # print val
-            inputs_add = desc_ops_add.inputs.add()
-            inputs_add.parameter = op_weight_tup[i]
-            inputs_add.arguments.append(l_weights[i])
-
-        # for w in l_weights:
-        #     inputs_add = desc_ops_add.inputs.add()
-        #     # print w
-        #     inputs_add.parameter = op_weight_tup[0]
-        #     inputs_add.arguments.append(w)
-
-    @staticmethod
-    def package_ops_type(desc_ops_add, layer):
-        l_type = layer['type']
-        # print l_type
-        # print mdl2fluid_op_layer_dict.get(l_type)
-        desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type)
-
-    def package_vars(self, block_desc):
-        vars_add = block_desc.vars.add()
-        vars_add.name = 'feed'
-        vars_add.type.type = 9  # 9 is FEED_MINIBATCH
-        vars_add.persistable = 1
-        # fetch
-        vars_add = block_desc.vars.add()
-        vars_add.name = 'fetch'
-        vars_add.type.type = 10  # 10 is fetch list
-        vars_add.persistable = 1
-
-        json_matrix_ = self.mdl_json['matrix']
-        # print json_matrix_
-        for j in json_matrix_:
-            vars_add = block_desc.vars.add()
-            vars_add.name = j
-            vars_add.type.type = 7  # 7 is lodtensor
-            # print j
-            tensor = vars_add.type.lod_tensor.tensor
-            tensor.data_type = 5  # 5 is FP32
-
-            # print json_matrix_
-
-            dims_of_matrix = json_matrix_.get(j)
-            # dims_size = len(dims_of_matrix)
-            # print dims_size
-
-            # if dims_size == 4:
-            #     tensor.dims.append(dims_of_matrix[0])  # N
-            #     tensor.dims.append(dims_of_matrix[3])  # C
-            #     tensor.dims.append(dims_of_matrix[1])  # H
-            #     tensor.dims.append(dims_of_matrix[2])  # W
-            # else:
-
-            # issues in mdl model filter swich n and c
-            if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4:
-                print j
-                tensor.dims.append(dims_of_matrix[1])
-                tensor.dims.append(dims_of_matrix[0])
-                tensor.dims.append(dims_of_matrix[2])
-                tensor.dims.append(dims_of_matrix[3])
-                print tensor.dims
-            else:
-                for dims in dims_of_matrix:
-                    # print dims
-                    tensor.dims.append(dims)
-
-            if j in self.weight_list_:
-                vars_add.persistable = 1
-                dims_size = len(dims_of_matrix)
-                # print dims_size
-                if dims_size == 4:
-                    # convert weight from nhwc to nchw
-                    Swichter().nhwc2nchw_one_slice_add_head(
-                        'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
-                        'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
-                        'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp',
-                        dims_of_matrix[0],
-                        dims_of_matrix[1],
-                        dims_of_matrix[2],
-                        dims_of_matrix[3]
-                    )
-                else:
-                    Swichter().copy_add_head(
-                        'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
-                        'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
-                        'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp'
-                    )
-            else:
-                vars_add.persistable = 0
-
-
-mdl_path = "yolo/datas/multiobjects/YOLO_Universal.json"
-converter = Converter(mdl_path)
-converter.convert()
diff --git a/mobile/tools/python/modeltools/yolo/swicher.py b/mobile/tools/python/modeltools/yolo/swicher.py
deleted file mode 100644
index 713ce93985..0000000000
--- a/mobile/tools/python/modeltools/yolo/swicher.py
+++ /dev/null
@@ -1,115 +0,0 @@
-from array import array
-
-
-class Swichter:
-    def __init__(self):
-        pass
-
-    def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width):
-        from_file = open(from_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-
-        float_array = array("f")
-        float_array.fromfile(from_file, width * height * batch * channel)
-        float_write_array = array("f")
-
-        for b in range(batch):
-            for c in range(channel):
-                for h in range(height):
-                    for w in range(width):
-                        float_value = float_array[b * channel * width * height
-                                                  + channel * (h * width + w) + c]
-
-                        float_write_array.append(float_value)
-
-        float_write_array.tofile(to_file)
-        from_file.close()
-        to_file.close()
-
-    def copy(self, from_file_name, to_file_name):
-        from_file = open(from_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-
-        to_file.write(from_file.read())
-        from_file.close()
-        to_file.close()
-
-    def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width):
-        from_file = open(from_file_name, "rb")
-        tmp_file = open(tmp_file_name, "wb+")
-        float_array = array("f")
-        float_array.fromfile(from_file, width * height * batch * channel)
-        float_write_array = array("f")
-
-        for b in range(batch):
-            for c in range(channel):
-                for h in range(height):
-                    for w in range(width):
-                        float_value = float_array[b * channel * width * height
-                                                  + channel * (h * width + w) + c]
-
-                        float_write_array.append(float_value)
-
-        float_write_array.tofile(tmp_file)
-        tmp_file.close()
-        from_file.close()
-
-        tmp_file = open(tmp_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-
-        tmp = tmp_file.read()
-        head = self.read_head('yolo/datas/yolo/head')
-        to_file.write(head)
-        to_file.write(tmp)
-        tmp_file.close()
-        to_file.close()
-
-    def read_head(self, head_file):
-        from_file = open(head_file, "rb")
-        read = from_file.read(24)
-        # print read
-        from_file.close()
-        # print read
-        return read
-
-    def copy_add_head(self, from_file_name, to_file_name, tmp_file_name):
-        from_file = open(from_file_name, "rb")
-        to_file = open(to_file_name, "wb")
-        # tmp_file = open(tmp_file_name, "wb")
-
-        head = self.read_head('yolo/datas/yolo/head')
-        to_file.write(head)
-        to_file.write(from_file.read())
-        from_file.close()
-        to_file.close()
-        pass
-
-    def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding):
-        print'padding  = %d' % padding
-        from_file = open(from_file_name, "rb")
-        # print len(from_file.read())
-        from_file.seek(padding, 0)
-
-        read = from_file.read()
-        print len(read)
-
-        to_file = open(to_file_name, "wb")
-        # tmp_file = open(tmp_file_name, "wb")
-
-        head = self.read_head('yolo/datas/yolo/head')
-        to_file.write(head)
-        to_file.write(read)
-        from_file.close()
-        to_file.close()
-        pass
-
-# Swichter().nhwc2nchw_one_slice_add_head(
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin',
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0',
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp',
-#     32,
-#     3, 3, 3)
-
-# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/head')
-
-# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
diff --git a/mobile/tools/quantification/CMakeLists.txt b/mobile/tools/quantification/CMakeLists.txt
deleted file mode 100644
index 13a4fb87b9..0000000000
--- a/mobile/tools/quantification/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-cmake_minimum_required(VERSION 3.6)
-project(quali)
-add_definitions(-DENABLE_EXCEPTION)
-
-set(CMAKE_CXX_STANDARD 11)
-file(GLOB_RECURSE QULIFICATON_CC src/*.cc src/*.cpp src/*.c src/*.mm)
-file(GLOB_RECURSE QULIFICATON_H src/*.h)
-include_directories(. src/)
-
-#add_library(paddle-mobile SHARED ${QULIFICATON_CC} ${QULIFICATON_H} convert.cpp)
-
-add_executable(quantify convert.cpp ${QULIFICATON_CC} ${QULIFICATON_H})
diff --git a/mobile/tools/quantification/README.md b/mobile/tools/quantification/README.md
deleted file mode 100644
index c2f9e63249..0000000000
--- a/mobile/tools/quantification/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# 模型量化脚本
-
-#### 量化脚本使用指南
-1. 在PaddleMobile项目目录下(如 ~/PaddleProject/paddle-mobile)
-
-2. cd到  tools/quantification/ 目录
-
-3. cmake编译
-
-    ``` sh
-    cmake .
-    make
-    ```
-
-4. 运行量化脚本
-    ```sh
-    ./quantify (0:seperated. 1:combined ) (输入路径) (输出路径)
-    # quantify googlenet seperated   from  /Users/xiebaiyuan/PaddleProject/quali/models/googlenet to ./googlenet_min
-    ./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min 
-
-    ```
-
-*注:*
-*量化工具中*
-*1.seperated模型model文件默认命名为 "__model__";*
-*2.combined模型的model文件默认命名为 "model",参数文件默认命名为"params";*
-
-    
-##### 整体如下:
-以googlenet非combined为例:
-
-```sh
-cd tools/quantification/
-cmake .
-make
-./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
-```
diff --git a/mobile/tools/quantification/convert.cpp b/mobile/tools/quantification/convert.cpp
deleted file mode 100644
index 0d675de205..0000000000
--- a/mobile/tools/quantification/convert.cpp
+++ /dev/null
@@ -1,480 +0,0 @@
-
-
-#include "src/enforce.h"
-#include "src/var_desc.h"
-#include "src/program_desc.h"
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include "src/framework.pb-c.h"
-#include "src/protobuf-c.h"
-#include 
-#include 
-#include 
-
-const size_t kSize64 = sizeof(uint64_t);
-const size_t kSize32 = sizeof(uint32_t);
-const int minimal_fold_size = 2;
-float max_entropy = 0.0;
-
-float entropy(std::vector &factors) {
-    int n = factors.size();
-    std::vector counts(256);
-    for (uint8_t &factor : factors) {
-        counts[factor]++;
-    }
-    float res = 1.0;
-    float shift = 100000.0;
-    for (int i = 0; i < 256; i++) {
-        res *= (counts[i] + shift) / (n + shift);
-    }
-    return 1.0 / res;
-}
-
-char *Get_binary_data(const std::string &filename) {
-
-    FILE *file = fopen(filename.c_str(), "rb");
-
-    PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
-                          filename.c_str());
-    fseek(file, 0, SEEK_END);
-    int64_t size = ftell(file);
-
-    PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
-    rewind(file);
-    auto *data = new char[size];
-    size_t bytes_read = fread(data, 1, static_cast(size), file);
-    PADDLE_MOBILE_ENFORCE(bytes_read == size,
-                          "read binary file bytes do not match with fseek");
-    fclose(file);
-    return data;
-}
-
-
-static size_t ReadBuffer(const char *file_name, uint8_t **out) {
-    FILE *fp;
-    fp = fopen(file_name, "rb");
-    PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
-    fseek(fp, 0, SEEK_END);
-    auto size = static_cast(ftell(fp));
-    rewind(fp);
-    *out = reinterpret_cast(malloc(size));
-    size_t cur_len = 0;
-    size_t nread;
-    while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
-        cur_len += nread;
-    }
-    fclose(fp);
-    return cur_len;
-}
-
-std::shared_ptr loadParams(const std::string &model_path) {
-    PaddleMobile__Framework__Proto__ProgramDesc *c_program;
-    uint8_t *buf = nullptr;
-    size_t read_size = ReadBuffer(model_path.c_str(), &buf);
-    PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
-    c_program = paddle_mobile__framework__proto__program_desc__unpack(
-            nullptr, read_size, buf);
-    PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
-    auto originProgramDesc = std::make_shared(c_program);
-    return originProgramDesc;
-
-}
-
-void LoadWithDumpForInt8(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file, int quantification_fold) {
-    // 1. version
-    uint32_t version = *reinterpret_cast(*dataP);
-
-    // write version
-    fwrite(&version, kSize32, 1, out_file);
-
-    *dataP += kSize32;
-
-    // 2 Lod information
-    auto *lod_level_ptr = new uint64_t();
-    memcpy(lod_level_ptr, *dataP, kSize64);
-
-    uint64_t lod_level = 0;
-    // write lod Information
-    fwrite(&lod_level, kSize64, 1, out_file);
-    delete lod_level_ptr;
-
-    *dataP += kSize64;
-
-    for (uint64_t i = 0; i < lod_level; ++i) {
-        uint64_t size = *reinterpret_cast(*dataP);
-        // write lod size
-        fwrite(&size, kSize64, 1, out_file);
-        (*dataP) += kSize64;
-
-        std::vector tmp(size / sizeof(size_t));
-        for (unsigned long &k : tmp) {
-            k = *reinterpret_cast(*dataP);
-            (*dataP) += sizeof(size_t);
-        }
-        // write lod size vector
-        fwrite(&tmp, sizeof(size_t), tmp.size(), out_file);
-    }
-
-    // 3. tensor version
-    uint32_t tensor_version = *reinterpret_cast(*dataP);
-    // write tensor version
-    fwrite(&tensor_version, kSize32, 1, out_file);
-    (*dataP) += kSize32;
-
-    // 4. tensor desc
-    int32_t size = *reinterpret_cast(*dataP);
-    // write tensor desc
-    fwrite(&size, sizeof(int32_t), 1, out_file);
-    (*dataP) += sizeof(int32_t);
-
-    std::unique_ptr buf(new char[size]);
-    for (int m = 0; m < size; ++m) {
-        buf.get()[m] = (*dataP)[m];
-    }
-
-    fwrite(buf.get(), sizeof(char), static_cast(size), out_file);
-    (*dataP) += (sizeof(char) * size);
-
-    const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
-    int memory_size = 1;
-    for (auto l : desc.Dims()) {
-        memory_size *= l;
-    }
-
-    void *memory = nullptr;
-    int type_size = 0;
-    switch (desc.DataType()) {
-        case paddle_mobile::framework::VARTYPE_TYPE_FP16:
-            type_size = 2;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_FP32:
-            type_size = 4;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_FP64:
-            type_size = 8;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_INT32:
-            type_size = 4;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_INT64:
-            type_size = 8;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
-            type_size = 1;
-            break;
-        default:
-            break;
-    }
-    size_t tensorSize = sizeof(char) * memory_size * type_size;
-
-    memory = new char[tensorSize];
-
-    for (int n = 0; n < tensorSize; ++n) {
-        static_cast(memory)[n] = (*dataP)[n];
-    }
-    *dataP += tensorSize;
-
-    quantification_fold = std::min(std::max(1, memory_size / minimal_fold_size), quantification_fold);
-    int step = std::max(memory_size / quantification_fold, 1);
-
-    int visited_fold = 0;
-    while (visited_fold * step < memory_size) {
-        // for float 32
-        float min_value = std::numeric_limits::max();
-        float max_value = std::numeric_limits::min();
-
-        for (int k = visited_fold * step; k < std::min((visited_fold + 1) * step, memory_size); ++k) {
-            min_value = std::min(min_value, static_cast (memory)[k]);
-            max_value = std::max(max_value, static_cast (memory)[k]);
-        }
-
-        fwrite(&min_value, sizeof(float), 1, out_file);
-        fwrite(&max_value, sizeof(float), 1, out_file);
-
-        std::vector factors;
-        for (int g = visited_fold * step; g < std::min((visited_fold + 1) * step, memory_size); ++g) {
-            float value = static_cast (memory)[g];
-            auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
-            factors.push_back(factor);
-            fwrite(&factor, sizeof(uint8_t), 1, out_file);
-        }
-        max_entropy = fmax(max_entropy, entropy(factors));
-        visited_fold++;
-    }
-}
-
-void
-quantificate_combined_int8(const std::string &model_path, const std::string ¶m_path, const std::string ¶m_min_path, int quantification_fold) {
-    auto program = loadParams(model_path);
-    char *origin_data = Get_binary_data(param_path);
-    char *data = origin_data;
-    FILE *out_file = fopen(param_min_path.c_str(), "wb");
-    for (const auto &block : program->Blocks()) {
-        for (const auto &var_desc : block->Vars()) {
-            if (var_desc->Persistable()) {
-                if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-                    continue;
-                }
-                LoadWithDumpForInt8(*var_desc, &data, out_file, quantification_fold);
-            }
-        }
-    }
-    fclose(out_file);
-    delete origin_data;
-}
-
-void quantificate_seperated_int8(const std::string model_dir, const std::string param_min_path, int quantification_fold) {
-    auto program = loadParams(model_dir + "/__model__");
-
-    std::string shell_command = "mkdir " + param_min_path;
-    system(shell_command.c_str());
-
-    for (const auto &block : program->Blocks()) {
-        for (const auto &var_desc : block->Vars()) {
-            if (var_desc->Persistable()) {
-                if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-                    continue;
-                }
-                std::string file_name = param_min_path + "/" + var_desc->Name();
-                FILE *out_file = fopen(file_name.c_str(), "wb");
-                char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
-                char *data = origin_data;
-                LoadWithDumpForInt8(*var_desc, &data, out_file, quantification_fold);
-                delete origin_data;
-                fclose(out_file);
-            }
-        }
-    }
-}
-
-void LoadWithDumpForFloat32(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file, int quantification_fold) {
-    // 1. version
-    uint32_t version = *reinterpret_cast(*dataP);
-
-    // write version
-    fwrite(&version, kSize32, 1, out_file);
-
-    *dataP += kSize32;
-
-    // 2 Lod information
-    auto *lod_level_ptr = new uint64_t();
-    memcpy(lod_level_ptr, *dataP, kSize64);
-
-    uint64_t lod_level = 0;
-    // write lod Information
-    fwrite(&lod_level, kSize64, 1, out_file);
-    delete lod_level_ptr;
-
-    *dataP += kSize64;
-
-    for (uint64_t i = 0; i < lod_level; ++i) {
-        uint64_t size = *reinterpret_cast(*dataP);
-        // write lod size
-        fwrite(&size, kSize64, 1, out_file);
-        (*dataP) += kSize64;
-
-        std::vector tmp(size / sizeof(size_t));
-        for (unsigned long &k : tmp) {
-            k = *reinterpret_cast(*dataP);
-            (*dataP) += sizeof(size_t);
-        }
-        // write lod size vector
-        fwrite(&tmp, sizeof(size_t), tmp.size(), out_file);
-    }
-
-    // 3. tensor version
-    uint32_t tensor_version = *reinterpret_cast(*dataP);
-    // write tensor version
-    fwrite(&tensor_version, kSize32, 1, out_file);
-    (*dataP) += kSize32;
-
-    // 4. tensor desc
-    int32_t size = *reinterpret_cast(*dataP);
-    // write tensor desc
-    fwrite(&size, sizeof(int32_t), 1, out_file);
-    (*dataP) += sizeof(int32_t);
-
-    std::unique_ptr buf(new char[size]);
-    for (int m = 0; m < size; ++m) {
-        buf.get()[m] = (*dataP)[m];
-    }
-
-    fwrite(buf.get(), sizeof(char), static_cast(size), out_file);
-    (*dataP) += (sizeof(char) * size);
-
-    const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
-    int memory_size = 1;
-    for (auto l : desc.Dims()) {
-        memory_size *= l;
-    }
-
-    void *memory = nullptr;
-    int type_size = 0;
-    switch (desc.DataType()) {
-        case paddle_mobile::framework::VARTYPE_TYPE_FP16:
-            type_size = 2;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_FP32:
-            type_size = 4;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_FP64:
-            type_size = 8;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_INT32:
-            type_size = 4;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_INT64:
-            type_size = 8;
-            break;
-        case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
-            type_size = 1;
-            break;
-        default:
-            break;
-    }
-    size_t tensorSize = sizeof(char) * memory_size * type_size;
-
-    memory = new char[tensorSize];
-
-    for (int n = 0; n < tensorSize; ++n) {
-        static_cast(memory)[n] = (*dataP)[n];
-    }
-    *dataP += tensorSize;
-
-    quantification_fold = std::min(std::max(1, memory_size / minimal_fold_size), quantification_fold);
-    int step = std::max(memory_size / quantification_fold, 1);
-
-    int visited_fold = 0;
-    while (visited_fold * step < memory_size) {
-        // for float 32
-        float min_value = std::numeric_limits::max();
-        float max_value = std::numeric_limits::min();
-
-        for (int k = visited_fold * step; k < std::min((visited_fold + 1) * step, memory_size); ++k) {
-            min_value = std::min(min_value, static_cast (memory)[k]);
-            max_value = std::max(max_value, static_cast (memory)[k]);
-        }
-
-        float diff = 0.0;
-        std::vector factors;
-        for (int g = visited_fold * step; g < std::min((visited_fold + 1) * step, memory_size); ++g) {
-            float value = static_cast (memory)[g];
-            auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
-            factors.push_back(factor);
-            float value_quantized = min_value + (factor / 255.0) * (max_value - min_value);
-            diff += fabs(value - value_quantized);
-            fwrite(&value_quantized, sizeof(float), 1, out_file);
-        }
-        max_entropy = fmax(max_entropy, entropy(factors));
-        if (memory_size > 0) {
-            std::cout << "avg diff caused by quantization for var " << var_desc.Name() << " is: " << (diff / memory_size) << std::endl;
-        }
-        visited_fold++;
-    }
-}
-
-void
-quantificate_combined_float32(const std::string &model_path, const std::string ¶m_path, const std::string ¶m_min_path, int quantification_fold) {
-    auto program = loadParams(model_path);
-    char *origin_data = Get_binary_data(param_path);
-    char *data = origin_data;
-    FILE *out_file = fopen(param_min_path.c_str(), "wb");
-    for (const auto &block : program->Blocks()) {
-        for (const auto &var_desc : block->Vars()) {
-            if (var_desc->Persistable()) {
-                if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-                    continue;
-                }
-                LoadWithDumpForFloat32(*var_desc, &data, out_file, quantification_fold);
-            }
-        }
-    }
-    fclose(out_file);
-    delete origin_data;
-}
-
-void quantificate_seperated_float32(const std::string model_dir, const std::string param_min_path, int quantification_fold) {
-    auto program = loadParams(model_dir + "/__model__");
-
-    std::string shell_command = "mkdir " + param_min_path;
-    system(shell_command.c_str());
-
-    for (const auto &block : program->Blocks()) {
-        for (const auto &var_desc : block->Vars()) {
-            if (var_desc->Persistable()) {
-                if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-                    continue;
-                }
-                std::string file_name = param_min_path + "/" + var_desc->Name();
-                FILE *out_file = fopen(file_name.c_str(), "wb");
-                char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
-                char *data = origin_data;
-                LoadWithDumpForFloat32(*var_desc, &data, out_file, quantification_fold);
-                delete origin_data;
-                fclose(out_file);
-            }
-        }
-    }
-}
-
-int main(int argc, char **argv) {
-    const std::string kNoteEg = "( eg:  ./quantify 1 your_combined_model_path output_path  or  ./quantify 0 your_seperated_model_path output_path  or  ./quantify 3 your_seperated_model_path output_path  or  ./quantify 2 your_seperated_model_path output_path)";
-
-    PADDLE_MOBILE_ENFORCE(argc > 1, "wee need params.%s ", kNoteEg.c_str());
-
-    std::string action_type = argv[1];
-    PADDLE_MOBILE_ENFORCE(argc > 1 && (action_type) == "0" || action_type == "1" || action_type == "2" || action_type == "3",
-                          "only 0, 1, 2 or 3 supported, current is %s %s ",
-                          action_type.c_str(),
-                          kNoteEg.c_str());
-
-    PADDLE_MOBILE_ENFORCE(argc > 2, "we need your model path. %s ", kNoteEg.c_str());
-    std::string base_path = argv[2];
-
-    PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str());
-    std::string output_path = argv[3];
-
-    int quantification_fold = 1;
-    if (argc > 4) {
-        quantification_fold = std::stoi(argv[4]);
-    }
-
-    if (action_type == "0") {
-        // for seperated
-        const std::string &seperated_min_dir = output_path;
-        quantificate_seperated_int8(base_path, seperated_min_dir, quantification_fold);
-        return 0;
-    }
-
-    if (action_type == "1") {
-        // for combined
-        const std::string &combined_min_dir = output_path;
-        std::string model_path = base_path + "/model";
-        std::string param_path = base_path + "/params";
-        quantificate_combined_int8(model_path, param_path, combined_min_dir, quantification_fold);
-        std::cout << "max entropy : " << max_entropy << std::endl;
-        return 0;
-    }
-
-    if (action_type == "2") {
-        // for seperated
-        const std::string &seperated_min_dir = output_path;
-        quantificate_seperated_float32(base_path, seperated_min_dir, quantification_fold);
-        return 0;
-    }
-
-    if (action_type == "3") {
-        // for combined
-        const std::string &combined_min_dir = output_path;
-        std::string model_path = base_path + "/model";
-        std::string param_path = base_path + "/params";
-        quantificate_combined_float32(model_path, param_path, combined_min_dir, quantification_fold);
-        return 0;
-    }
-
-    return -1;
-}
diff --git a/mobile/tools/quantification/scripts/run.py b/mobile/tools/quantification/scripts/run.py
deleted file mode 100644
index bf34441470..0000000000
--- a/mobile/tools/quantification/scripts/run.py
+++ /dev/null
@@ -1,661 +0,0 @@
-# -*- coding: utf-8 -*
-import os
-import sys
-import math
-import subprocess
-import numpy as np
-import paddle.fluid as fluid
-
-model_path = "model"
-checked_model_path = "quantification_model"
-feed_path = "feeds"
-output_path = "outputs"
-diff_threshold = 0.1
-is_lod = False
-mobile_model_path = ""
-fast_check = False
-is_sample_step = False
-sample_step = 1
-sample_num = 20
-need_encrypt = False
-checked_encrypt_model_path = "checked_encrypt_model"
-output_var_filter = []
-output_key_filter = {}
-check_shape = False
-quantification = True
-quantification_fold = int(sys.argv[1])
-architecture = "arm-v7a"
-# architecture = "arm-v8a"
-
-np.set_printoptions(linewidth=150)
-
-mobile_exec_root = "/data/local/tmp/bin"
-mobile_src_root = os.path.abspath("../../../")
-if mobile_src_root.endswith("/"):
-    mobile_src_root = mobile_src_root[:-1]
-
-dot = "•"
-black = lambda x: "\033[30m" + str(x) + "\033[0m"
-red = lambda x: "\033[31m" + str(x) + "\033[0m"
-green = lambda x: "\033[32m" + str(x) + "\033[0m"
-yellow = lambda x: "\033[33m" + str(x) + "\033[0m"
-reset = lambda x: "\033[0m" + str(x)
-
-def pp_tab(x, level=0):
-    header = ""
-    for i in range(0, level):
-        header += "\t"
-    # print(header + str(x))
-def pp_black(x, level=0):
-    pp_tab(black(x) + reset(""), level)
-def pp_red(x, level=0):
-    pp_tab(red(x) + reset(""), level)
-def pp_green(x, level=0):
-    pp_tab(green(x) + reset(""), level)
-def pp_yellow(x, level=0):
-    pp_tab(yellow(x) + reset(""), level)
-
-def sh(command):
-    pipe = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    return pipe.stdout.read().decode("utf-8")
-def push(src, dest=""):
-    sh("adb push {} {}".format(src, mobile_exec_root + "/" + dest))
-
-pp_yellow(dot + " start inspecting fluid model")
-
-exe = fluid.Executor(fluid.CPUPlace())
-exe.run(fluid.default_startup_program())
-
-# 加载模型
-def load_model(model_path):
-    prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model", params_filename="params")
-    return (prog, feeds, fetches)
-
-prog, feeds, fetches = load_model(model_path)
-
-# 强制要求所有张量的形状,在model和params中一致,并重新保存模型
-def resave_model(feed_kv):
-    if len(mobile_model_path) > 0:
-        pp_green("has set mobile_model_path, stop checking model & params", 1)
-        sh("cp {}/* {}".format(mobile_model_path, checked_model_path))
-        return
-    ops = prog.current_block().ops
-    vars = prog.current_block().vars
-    # 强制所有var为可持久化
-    p_names = []
-    for name in vars:
-        name = str(name)
-        v = fluid.framework._get_var(name, prog)
-        if not v.persistable:
-            v.persistable = True
-            p_names.append(name)
-    outputs = run_model(feed_kv=feed_kv)
-    has_found_wrong_shape = False
-    # 修正每个var的形状
-    for name in vars:
-        name = str(name)
-        v = vars[name]
-        if v.persistable:
-            v1 = fluid.global_scope().find_var(name)
-            try:
-                t1 = v1.get_tensor()
-                shape = t1.shape()
-            except:
-                continue
-            if v.desc.shape() != shape:
-                has_found_wrong_shape = True
-            v.desc.set_shape(shape)
-    # 恢复var的可持久化属性
-    for name in p_names:
-        v = fluid.framework._get_var(name, prog)
-        v.persistable = False
-    if not quantification:
-        fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
-    if has_found_wrong_shape:
-        pp_red("has found wrong shape", 1)
-    else:
-        pp_green("has not found wrong shape", 1)
-    pp_green("new model is saved into directory 【{}】".format(checked_model_path), 1)
-
-# 分别加密model和params,加密key使用同一个
-def encrypt_model():
-    if not need_encrypt:
-        return
-    pp_yellow(dot + dot + " encrypting model")
-    if not os.path.exists(checked_encrypt_model_path):
-        os.mkdir(checked_encrypt_model_path)
-    res = sh("model-encrypt-tool/enc_key_gen -l 20 -c 232")
-    lines = res.split("\n")
-
-    for line in lines:
-        if line.startswith("key:"):
-            line = line.replace('key:','')
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i {}/model -o {}/model.ml".format(line, checked_model_path, checked_model_path))
-            sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i {}/params -o {}/params.ml".format(line, checked_model_path, checked_model_path))
-            pp_green("model has been encrypted, key is : {}".format(line), 1)
-            sh("mv {} {}".format(checked_model_path + "/*.ml", checked_encrypt_model_path))
-            return
-    pp_red("model encrypt error", 1)
-
-# 生成feed的key-value对
-def gen_feed_kv():
-    feed_kv = {}
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        data = np.random.random(feed_shape).astype("float32")
-        feed_kv[feed_name] = data
-    return feed_kv
-
-# 保存feed的key-value对
-def save_feed_kv(feed_kv):
-    for feed_name in feed_kv:
-        feed_data = feed_kv[feed_name]
-        feed_list = feed_data.flatten().tolist()
-        if not os.path.exists(feed_path):
-            os.mkdir(feed_path)
-        file_name = feed_name.replace("/", "_")
-        out_file = open(feed_path + "/" + file_name, "w")
-        for feed_item in feed_list:
-            out_file.write("{}\n".format(feed_item))
-        out_file.close()
-
-last_feed_var_name = None
-last_feed_file_name = None
-last_feed_var_lod = None
-# 加载feed的key-value对
-def load_feed_kv():
-    if not os.path.exists(feed_path):
-        return None
-    global last_feed_var_name
-    global last_feed_file_name
-    global last_feed_var_lod
-    feed_kv = {}
-    pp_yellow(dot + dot + " checking feed info")
-    pp_green("feed data is saved into directory 【{}】".format(feed_path), 1)
-    for feed_name in feeds:
-        feed_shape = get_feed_var_shape(feed_name)
-        pp_tab("feed var name : {}; feed var shape : {}".format(feed_name, feed_shape), 1)
-        file_name = feed_name.replace("/", "_")
-        last_feed_var_name = feed_name
-        last_feed_file_name = file_name
-        feed_file_path = feed_path + "/" + file_name
-        if not os.path.exists(feed_file_path):
-            return None
-        data = np.loadtxt(feed_file_path)
-        expected_len = 1
-        for dim in feed_shape:
-            expected_len *= dim
-        if len(np.atleast_1d(data)) != expected_len:
-            return None
-        data = data.reshape(feed_shape).astype("float32")
-        
-        if is_lod:
-            data_shape = [1]
-            for dim in feed_shape:
-                data_shape.append(dim)
-            data = data.reshape(data_shape).astype("float32")
-            tensor = fluid.LoDTensor()
-            seq_lens = [len(seq) for seq in data]
-            cur_len = 0
-            lod = [cur_len]
-            for l in seq_lens:
-                cur_len += l
-                lod.append(cur_len)
-            data = data.reshape(feed_shape)
-            tensor.set(data, fluid.CPUPlace())
-            tensor.set_lod([lod])
-            last_feed_var_lod = lod
-            feed_kv[feed_name] = tensor
-        else:
-            feed_kv[feed_name] = data
-    return feed_kv
-
-# 运行模型
-def run_model(feed_kv=None):
-    if feed_kv is None:
-        feed_kv = gen_feed_kv()
-    outputs = exe.run(prog, feed=feed_kv, fetch_list=fetches, return_numpy=False)
-    results = []
-    for output in outputs:
-        results.append(np.array(output))
-    return results
-
-# 获取变量形状
-def get_var_shape(var_name):
-    vars = prog.current_block().vars
-    shape = vars[var_name].desc.shape()
-    for i in range(len(shape)):
-        dim = shape[i]
-        if dim == -1:
-            shape[i] = 1
-    return shape
-
-# 获取输入变量形状
-def get_feed_var_shape(var_name):
-    # 如果想写死输入形状,放开以下语句
-    # return [1, 3, 224, 224]
-    return get_var_shape(var_name)
-
-persistable_cache = []
-# 所有var,全部变成持久化
-def force_all_vars_to_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if not persistable:
-            persistable_cache.append(var_name)
-            v.persistable = True
-
-# 恢复持久化属性
-def restore_all_vars_persistable():
-    global persistable_cache
-    for var_name in vars.keys():
-        var_name = str(var_name)
-        v = fluid.framework._get_var(var_name, prog)
-        persistable = v.persistable
-        if var_name in persistable_cache:
-            v.persistable = False
-    persistable_cache = []
-
-# 获取var的数据
-def get_var_data(var_name, feed_kv=None):
-    output = np.array(fluid.global_scope().var(var_name).get_tensor())
-    return output
-
-output_var_cache = {}
-def tensor_sample(tensor):
-    if is_sample_step:
-        step = sample_step
-    else:
-        step = math.floor(len(tensor) / sample_num)
-    step = max(step, 1)
-    step = int(step)
-    sample = []
-    for i in range(0, len(tensor), step):
-        sample.append(tensor[i])
-    return sample
-
-op_cache = {}
-# 获取每层输出的数据
-def save_all_op_output(feed_kv=None):
-    force_all_vars_to_persistable()
-    outputs = run_model(feed_kv=feed_kv)
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    ops = prog.current_block().ops
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    feed_names = feeds
-    if len(output_var_filter) > 0:
-        for fetch_name in fetch_names:
-            output_var_filter.append(fetch_name)
-    for i in range(len(ops)):
-        op = ops[i]
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in ["Y", "Out", "Output"]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            for name in op.output_arg_names:
-                var_name = name
-                if "tmp" in name:
-                    break
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    for i in range(len(ops)):
-        op = ops[i]
-        if op.type not in output_key_filter:
-            continue
-        var_name = None
-        var_name_index = -1
-        for index in range(len(op.output_names)):
-            if op.output_names[index] in output_key_filter[op.type]:
-                var_name_index = index
-                break
-        if var_name_index != -1:
-            var_name = op.output_arg_names[var_name_index]
-        else:
-            continue
-        if len(output_var_filter) > 0:
-            if var_name not in output_var_filter:
-                continue
-        # real_var_name = None
-        # if op.type == "fetch":
-        #     for name in op.input_arg_names:
-        #         real_var_name = name
-        #         if "tmp" in name:
-        #             break
-        # else:
-        #     real_var_name = var_name
-        if fast_check:
-            if var_name not in fetch_names and var_name not in feed_names:
-                continue
-        try:
-            data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
-            sample = tensor_sample(data)
-            output_var_cache[var_name] = (sample)
-            op_cache[i] = (var_name, op)
-            file_name = var_name.replace("/", "_")
-            out_file = open(output_path + "/" + file_name, "w")
-            if var_name in feed_names:
-                for item in data:
-                    out_file.write("{}\n".format(item))
-            else:
-                for item in sample:
-                    out_file.write("{}\n".format(item))
-            out_file.close()
-        except:
-            pass
-    pp_green("all the op outputs are saved into directory 【{}】".format(output_path), 1)
-    restore_all_vars_persistable()
-
-ops = prog.current_block().ops
-vars = prog.current_block().vars
-
-pp_yellow(dot + dot + " checking op list")
-op_types = set()
-for op in ops:
-    op_types.add(op.type)
-pp_tab("op types : {}".format(op_types), 1)
-
-def check_mobile_results(args, fuse, mem_opt):
-    args = "{} {} {} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", "1" if quantification else "0", quantification_fold, args)
-    res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net {}\"".format(mobile_exec_root, args))
-    lines = res.split("\n")
-    # for line in lines:
-    #     print(line)
-    for line in lines:
-        if line.startswith("auto-test-debug"):
-            print(line)
-    pp_yellow(dot + dot + " checking paddle mobile results for {} -- {} ".format(green("【fusion】" if fuse else "【non fusion】"), green("【memory-optimization】" if mem_opt else "【non-memory-optimization】")))
-    mobile_var_cache = {}
-    for line in lines:
-        parts = line.split(" ")
-        if len(parts) < 2:
-            continue
-        if "auto-test" != parts[0]:
-            continue
-        if parts[1] == "load-time-cost":
-            pp_green("load time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "predict-time-cost":
-            pp_green("predict time cost : {}".format(parts[2]), 1) 
-        elif parts[1] == "preprocess-time-cost":
-            pp_green("preprocess time cost : {}".format(parts[2]), 1)
-        elif parts[1] == "var":
-            var_name = parts[2]
-            values = list(map(lambda x: float(x), parts[3:]))
-            mobile_var_cache[var_name] = values
-    error_index = None
-    error_values1 = None
-    error_values2 = None
-    checked_names = []
-    fetch_names = []
-    for fetch in fetches:
-        fetch_names.append(fetch.name)
-    fetch_diff = 0.0
-    fetch_count = 0
-    for index in op_cache:
-        op_output_var_name, op = op_cache[index]
-        if not op_output_var_name in output_var_cache:
-            continue
-        if not op_output_var_name in mobile_var_cache:
-            continue
-        if op_output_var_name not in fetch_names:
-            continue
-        values1 = output_var_cache[op_output_var_name]
-        values2 = mobile_var_cache[op_output_var_name]
-        shape = get_var_shape(op_output_var_name) if check_shape else []
-        for i in range(len(values1)):
-            v1 = values1[i]
-            v2 = values2[len(shape) + i]
-            fetch_diff += abs(v1 - v2)
-            fetch_count += 1
-    if fetch_count != 0:
-        pp_yellow("output avg diff : {}".format(fetch_diff / fetch_count), 1)
-        print(fetch_diff / fetch_count)
-    for index in op_cache:
-        op_output_var_name, op = op_cache[index]
-        if mem_opt:
-            found_in_fetch = False
-            for fetch in fetches:
-                if op_output_var_name == fetch.name:
-                    found_in_fetch = True
-                    break
-            if not found_in_fetch:
-                continue
-        if not op_output_var_name in output_var_cache:
-            continue
-        if not op_output_var_name in mobile_var_cache:
-            continue
-        if op_output_var_name not in fetch_names:
-            continue
-        values1 = output_var_cache[op_output_var_name]
-        values2 = mobile_var_cache[op_output_var_name]
-        shape = get_var_shape(op_output_var_name) if check_shape else []
-        if len(values1) + len(shape) != len(values2):
-            error_index = index
-        for i in range(len(shape)):
-            v1 = shape[i]
-            v2 = values2[i]
-            if v1 != v2:
-                error_index = index
-                break
-        if error_index == None:
-            for i in range(len(values1)):
-                v1 = values1[i]
-                v2 = values2[len(shape) + i]
-                if abs(v1 - v2) > diff_threshold:
-                    error_index = index
-                    break
-        checked_names.append(op_output_var_name)
-        if error_index != None:
-            error_values1 = values1
-            error_values2 = values2
-            break
-    if error_index == None:
-        for name in fetch_names:
-            if name not in checked_names:
-                error_index = -1
-                break
-    if error_index == None:
-        pp_green("outputs are all correct", 1)
-    elif error_index == -1:
-        pp_red("outputs are missing")
-    else:
-        error_values1 = np.array(error_values1)
-        error_values2 = np.array(error_values2)
-        # pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
-        pp_red("outputs are incorrect", 1)
-        pp_red("fluid results are : ", 1)
-        pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
-        pp_yellow("paddle mobile results are : ", 1)
-        pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
-        if not fuse and not mem_opt:
-            pp_yellow("checking individual ops : ", 1)
-            error_index = None
-            error_values1 = None
-            error_values2 = None
-            checked_names = []
-            fetch_names = []
-            for fetch in fetches:
-                fetch_names.append(fetch.name)
-            for index in op_cache:
-                op_output_var_name, op = op_cache[index]
-                if mem_opt:
-                    found_in_fetch = False
-                    for fetch in fetches:
-                        if op_output_var_name == fetch.name:
-                            found_in_fetch = True
-                            break
-                    if not found_in_fetch:
-                        continue
-                if not op_output_var_name in output_var_cache:
-                    continue
-                if not op_output_var_name in mobile_var_cache:
-                    continue
-                if fuse or mem_opt:
-                    if op_output_var_name not in fetch_names:
-                        continue
-                values1 = output_var_cache[op_output_var_name]
-                values2 = mobile_var_cache[op_output_var_name]
-                shape = get_var_shape(op_output_var_name) if check_shape else []
-                if len(values1) + len(shape) != len(values2):
-                    error_index = index
-                for i in range(len(shape)):
-                    v1 = shape[i]
-                    v2 = values2[i]
-                    if v1 != v2:
-                        error_index = index
-                        break
-                if error_index == None:
-                    for i in range(len(values1)):
-                        v1 = values1[i]
-                        v2 = values2[len(shape) + i]
-                        if abs(v1 - v2) > diff_threshold:
-                            error_index = index
-                            break
-                checked_names.append(op_output_var_name)
-                if error_index != None:
-                    error_values1 = values1
-                    error_values2 = values2
-                    break
-            if error_index == None:
-                for name in fetch_names:
-                    if name not in checked_names:
-                        error_index = -1
-                        break
-            if error_index == None:
-                pp_green("outputs are all correct", 1)
-            elif error_index == -1:
-                pp_red("outputs are missing")
-            else:
-                error_values1 = np.array(error_values1)
-                error_values2 = np.array(error_values2)
-                # pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
-                pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
-                    error_index,op_cache[error_index][1].type,op_output_var_name), 1)
-                pp_red("fluid results are : ", 1)
-                pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
-                pp_yellow("paddle mobile results are : ", 1)
-                pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
-    # print(output_var_cache)
-    # print(mobile_var_cache)
-
-def main():
-    # 加载kv
-    feed_kv = load_feed_kv()
-    if feed_kv == None:
-        feed_kv = gen_feed_kv()
-        save_feed_kv(feed_kv)
-        feed_kv = load_feed_kv()
-    # 预测
-    pp_yellow(dot + dot + " checking inference")
-    outputs = run_model(feed_kv=feed_kv)
-    pp_tab("fluid output : {}".format(outputs), 1)
-    # 重新保存模型
-    pp_yellow(dot + dot + " checking model correctness")
-    resave_model(feed_kv=feed_kv)
-    # 输出加密模型
-    encrypt_model()
-    # 输出所有中间结果
-    pp_yellow(dot + dot + " checking output result of every op")
-    save_all_op_output(feed_kv=feed_kv)
-    pp_yellow(dot + dot + " checking fetch info")
-    for fetch in fetches:
-        fetch_name = fetch.name
-        fetch_shape = get_var_shape(fetch_name)
-        pp_tab("fetch var name : {}; fetch var shape : {}".format(fetch_name, fetch_shape), 1)
-    # 输出所有op、var信息
-    info_file = open("info.txt", "w")
-    for i in range(len(ops)):
-        op = ops[i]
-        info_file.write("{}th op: type - {}\n".format(i, op.type))
-        info_file.write("inputs:\n")
-        for var_name in op.input_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-        info_file.write("outputs:\n")
-        for var_name in op.output_arg_names:
-            try:
-                shape = get_var_shape(var_name)
-                shape_str = ", ".join(list(map(lambda x: str(x), shape)))
-                info_file.write("var {} : {}\n".format(var_name, shape_str))
-            except:
-                pass
-    info_file.close()
-    # 开始检查mobile的正确性
-    pp_yellow(dot + " start inspecting paddle mobile correctness & performance")
-    sh("rm -rf checked_model")
-    sh("cp -r {} checked_model".format(checked_model_path))
-    push("checked_model")
-    push(feed_path + "/" + last_feed_file_name, "input.txt")
-    push(mobile_src_root + "/build/release/{}/build/libpaddle-mobile.so".format(architecture))
-    push(mobile_src_root + "/build/release/{}/build/cl_kernel".format(architecture))
-    push(mobile_src_root + "/test/build/test-net")
-    last_feed_var_shape = get_feed_var_shape(last_feed_var_name)
-    args = str(len(last_feed_var_shape))
-    for dim in last_feed_var_shape:
-        args += " " + str(dim)
-    if is_lod:
-        args += " 1"
-        args += " " + str(len(last_feed_var_lod))
-        for dim in last_feed_var_lod:
-            args += " " + str(dim)
-    else:
-        args += " 0"
-    args += " " + str(len(output_var_cache))
-    args += " " + str(1 if is_sample_step else 0)
-    if is_sample_step:
-        args += " " + str(sample_step)
-    else:
-        args += " " + str(sample_num)
-    for var_name in output_var_cache.keys():
-        args += " " + var_name
-    args += " " + str(1 if check_shape else 0)
-    # if not fast_check:
-    #     check_mobile_results(args, False, False)
-    #     check_mobile_results(args, False, True)
-    # check_mobile_results(args, True, False)
-    check_mobile_results(args, True, True)
-
-if __name__ == "__main__":
-    main()
diff --git a/mobile/tools/quantification/src/block_desc_local.cpp b/mobile/tools/quantification/src/block_desc_local.cpp
deleted file mode 100644
index 8ad1982c05..0000000000
--- a/mobile/tools/quantification/src/block_desc_local.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-//
-// Created by 谢柏渊 on 2018/7/25.
-//
-#include "src/block_desc_local.h"
-#include 
-#include 
-#include 
-
-#include "src/framework.pb-c.h"
-
-std::vector>
-BlockDesc::Vars() const {
-  return vars_;
-}
-
-BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
-    : index_(desc->idx), parent_index_(desc->idx) {
-  for (int i = 0; i < desc->n_vars; ++i) {
-    PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i];
-    vars_.emplace_back(std::shared_ptr(
-        new paddle_mobile::framework::VarDesc(var_desc)));
-  }
-
-  std::sort(vars_.begin(), vars_.end(),
-            [](std::shared_ptr left,
-               std::shared_ptr right) {
-              return left->Name() < right->Name();
-            });
-
-  //        for (int j = 0; j < desc->n_ops; ++j) {
-  //            PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j];
-  //            ops_.emplace_back(new OpDesc(op_desc));
-  //        }
-}
diff --git a/mobile/tools/quantification/src/block_desc_local.h b/mobile/tools/quantification/src/block_desc_local.h
deleted file mode 100644
index 2ee8132af7..0000000000
--- a/mobile/tools/quantification/src/block_desc_local.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-//
-// Created by 谢柏渊 on 2018/7/25.
-//
-
-#ifndef TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
-#define TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
-
-#include 
-#include 
-#include "src/var_desc.h"
-
-class BlockDesc {
- public:
-  friend class Node;
-  friend class ProgramOptimize;
-  BlockDesc() {}
-  explicit BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc);
-
-  const int &ID() const { return index_; }
-
-  const bool &MultiThread() const { return multi_thread_; }
-
-  const int &Parent() const { return parent_index_; }
-
-  bool operator==(const BlockDesc &in_block) const {
-    return this->ID() == in_block.ID() && this->Parent() == in_block.Parent();
-  }
-
-  bool operator<(const BlockDesc &in_block) const {
-    return this->ID() < in_block.ID() && this->Parent() < in_block.Parent();
-  }
-
-  std::vector> Vars() const;
-
- private:
-  int index_;
-  bool multi_thread_;
-  int parent_index_;
-  std::vector> vars_;
-};
-
-#endif  // TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
diff --git a/mobile/tools/quantification/src/enforce.h b/mobile/tools/quantification/src/enforce.h
deleted file mode 100644
index 51d2110e32..0000000000
--- a/mobile/tools/quantification/src/enforce.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef ENABLE_EXCEPTION
-#include 
-#include 
-#include 
-
-#endif
-
-namespace paddle_mobile {
-
-#ifdef ENABLE_EXCEPTION
-struct PaddleMobileException : public std::exception {
-  const std::string exception_prefix = "paddle mobile C++ Exception: \n";
-  std::string message;
-
-  PaddleMobileException(const char *header, const char *detail,
-                        const char *file, const int line) {
-    char buffer[1500];
-    snprintf(buffer, sizeof(buffer),
-             "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail]  : %s\n",
-             exception_prefix.c_str(), header, file, line, detail);
-    message = std::string(buffer);
-  }
-  const char *what() const noexcept { return message.c_str(); }
-};
-
-#define PADDLE_MOBILE_THROW_EXCEPTION(...)                                 \
-  {                                                                        \
-    char buffer[1000];                                                     \
-    snprintf(buffer, sizeof(buffer), __VA_ARGS__);                         \
-    std::string detail(buffer);                                            \
-    throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \
-                                               __FILE__, __LINE__);        \
-  }
-
-#define PADDLE_MOBILE_ENFORCE(stat, ...)                                      \
-  {                                                                           \
-    if (stat) {                                                               \
-    } else {                                                                  \
-      char buffer[1000];                                                      \
-      snprintf(buffer, sizeof(buffer), __VA_ARGS__);                          \
-      std::string detail(buffer);                                             \
-      throw paddle_mobile::PaddleMobileException("paddle-mobile enforce",     \
-                                                 buffer, __FILE__, __LINE__); \
-    }                                                                         \
-  }
-#else
-#define PADDLE_MOBILE_THROW_EXCEPTION(...)
-#define PADDLE_MOBILE_ENFORCE(stat, ...)
-#endif
-
-}  // namespace paddle_mobile
diff --git a/mobile/tools/quantification/src/framework.pb-c.c b/mobile/tools/quantification/src/framework.pb-c.c
deleted file mode 100644
index aed0a6c9c0..0000000000
--- a/mobile/tools/quantification/src/framework.pb-c.c
+++ /dev/null
@@ -1,1403 +0,0 @@
-/* Generated by the protocol buffer compiler.  DO NOT EDIT! */
-/* Generated from: framework.proto */
-
-/* Do not generate deprecated warnings for self */
-#ifndef PROTOBUF_C__NO_DEPRECATED
-#define PROTOBUF_C__NO_DEPRECATED
-#endif
-
-#include "framework.pb-c.h"
-void paddle_mobile__framework__proto__op_desc__attr__init(
-    PaddleMobile__Framework__Proto__OpDesc__Attr *message) {
-  static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__op_desc__var__init(
-    PaddleMobile__Framework__Proto__OpDesc__Var *message) {
-  static const PaddleMobile__Framework__Proto__OpDesc__Var init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__op_desc__init(
-    PaddleMobile__Framework__Proto__OpDesc *message) {
-  static const PaddleMobile__Framework__Proto__OpDesc init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT;
-  *message = init_value;
-}
-size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__op_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
-
-PaddleMobile__Framework__Proto__OpDesc *
-paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
-                                                 size_t len,
-                                                 const uint8_t *data) {
-  return (PaddleMobile__Framework__Proto__OpDesc *)protobuf_c_message_unpack(
-      &paddle_mobile__framework__proto__op_desc__descriptor, allocator, len,
-      data);
-}
-void paddle_mobile__framework__proto__op_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__OpDesc *message,
-    ProtobufCAllocator *allocator) {
-  if (!message) return;
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__op_desc__descriptor);
-  protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
-}
-void paddle_mobile__framework__proto__op_proto__var__init(
-    PaddleMobile__Framework__Proto__OpProto__Var *message) {
-  static const PaddleMobile__Framework__Proto__OpProto__Var init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__op_proto__attr__init(
-    PaddleMobile__Framework__Proto__OpProto__Attr *message) {
-  static const PaddleMobile__Framework__Proto__OpProto__Attr init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__op_proto__init(
-    PaddleMobile__Framework__Proto__OpProto *message) {
-  static const PaddleMobile__Framework__Proto__OpProto init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT;
-  *message = init_value;
-}
-size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpProto *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__op_proto__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
-
-PaddleMobile__Framework__Proto__OpProto *
-paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
-                                                  size_t len,
-                                                  const uint8_t *data) {
-  return (PaddleMobile__Framework__Proto__OpProto *)protobuf_c_message_unpack(
-      &paddle_mobile__framework__proto__op_proto__descriptor, allocator, len,
-      data);
-}
-void paddle_mobile__framework__proto__op_proto__free_unpacked(
-    PaddleMobile__Framework__Proto__OpProto *message,
-    ProtobufCAllocator *allocator) {
-  if (!message) return;
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__op_proto__descriptor);
-  protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
-}
-void paddle_mobile__framework__proto__var_type__tensor_desc__init(
-    PaddleMobile__Framework__Proto__VarType__TensorDesc *message) {
-  static const PaddleMobile__Framework__Proto__VarType__TensorDesc init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init(
-    PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message) {
-  static const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc
-      init_value =
-          PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init(
-    PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message) {
-  static const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc
-      init_value =
-          PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__var_type__reader_desc__init(
-    PaddleMobile__Framework__Proto__VarType__ReaderDesc *message) {
-  static const PaddleMobile__Framework__Proto__VarType__ReaderDesc init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__var_type__channel_desc__init(
-    PaddleMobile__Framework__Proto__VarType__ChannelDesc *message) {
-  static const PaddleMobile__Framework__Proto__VarType__ChannelDesc init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__var_type__tuple__init(
-    PaddleMobile__Framework__Proto__VarType__Tuple *message) {
-  static const PaddleMobile__Framework__Proto__VarType__Tuple init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT;
-  *message = init_value;
-}
-void paddle_mobile__framework__proto__var_type__init(
-    PaddleMobile__Framework__Proto__VarType *message) {
-  static const PaddleMobile__Framework__Proto__VarType init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT;
-  *message = init_value;
-}
-size_t paddle_mobile__framework__proto__var_type__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarType *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__var_type__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
-PaddleMobile__Framework__Proto__VarType *
-paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator,
-                                                  size_t len,
-                                                  const uint8_t *data) {
-  return (PaddleMobile__Framework__Proto__VarType *)protobuf_c_message_unpack(
-      &paddle_mobile__framework__proto__var_type__descriptor, allocator, len,
-      data);
-}
-void paddle_mobile__framework__proto__var_type__free_unpacked(
-    PaddleMobile__Framework__Proto__VarType *message,
-    ProtobufCAllocator *allocator) {
-  if (!message) return;
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__var_type__descriptor);
-  protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
-}
-void paddle_mobile__framework__proto__var_desc__init(
-    PaddleMobile__Framework__Proto__VarDesc *message) {
-  static const PaddleMobile__Framework__Proto__VarDesc init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT;
-  *message = init_value;
-}
-size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__var_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
-
-PaddleMobile__Framework__Proto__VarDesc *
-paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
-                                                  size_t len,
-                                                  const uint8_t *data) {
-  return (PaddleMobile__Framework__Proto__VarDesc *)protobuf_c_message_unpack(
-      &paddle_mobile__framework__proto__var_desc__descriptor, allocator, len,
-      data);
-}
-void paddle_mobile__framework__proto__var_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__VarDesc *message,
-    ProtobufCAllocator *allocator) {
-  if (!message) return;
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__var_desc__descriptor);
-  protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
-}
-void paddle_mobile__framework__proto__block_desc__init(
-    PaddleMobile__Framework__Proto__BlockDesc *message) {
-  static const PaddleMobile__Framework__Proto__BlockDesc init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT;
-  *message = init_value;
-}
-size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__BlockDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__block_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
-
-PaddleMobile__Framework__Proto__BlockDesc *
-paddle_mobile__framework__proto__block_desc__unpack(
-    ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
-  return (PaddleMobile__Framework__Proto__BlockDesc *)protobuf_c_message_unpack(
-      &paddle_mobile__framework__proto__block_desc__descriptor, allocator, len,
-      data);
-}
-void paddle_mobile__framework__proto__block_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__BlockDesc *message,
-    ProtobufCAllocator *allocator) {
-  if (!message) return;
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__block_desc__descriptor);
-  protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
-}
-void paddle_mobile__framework__proto__program_desc__init(
-    PaddleMobile__Framework__Proto__ProgramDesc *message) {
-  static const PaddleMobile__Framework__Proto__ProgramDesc init_value =
-      PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT;
-  *message = init_value;
-}
-size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__ProgramDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__program_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
-
-PaddleMobile__Framework__Proto__ProgramDesc *
-paddle_mobile__framework__proto__program_desc__unpack(
-    ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
-  return (PaddleMobile__Framework__Proto__ProgramDesc *)
-      protobuf_c_message_unpack(
-          &paddle_mobile__framework__proto__program_desc__descriptor, allocator,
-          len, data);
-}
-void paddle_mobile__framework__proto__program_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__ProgramDesc *message,
-    ProtobufCAllocator *allocator) {
-  if (!message) return;
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__program_desc__descriptor);
-  protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
-}
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__op_desc__attr__field_descriptors[12] = {
-        {
-            "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, name), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, type),
-            &paddle_mobile__framework__proto__attr_type__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "i", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_i),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, i), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "f", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_FLOAT,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_f),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, f), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "s", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, s), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "ints", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_ints),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, ints), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "floats", 7, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_FLOAT,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_floats),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, floats),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-        {
-            "strings", 8, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_strings),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, strings),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-        {
-            "b", 10, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_b),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, b), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "bools", 11, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_bools),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, bools), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "block_idx", 12, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr,
-                     has_block_idx),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, block_idx),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-        {
-            "l", 13, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_l),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, l), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = {
-        8,  /* field[8] = b */
-        10, /* field[10] = block_idx */
-        9,  /* field[9] = bools */
-        3,  /* field[3] = f */
-        6,  /* field[6] = floats */
-        2,  /* field[2] = i */
-        5,  /* field[5] = ints */
-        11, /* field[11] = l */
-        0,  /* field[0] = name */
-        4,  /* field[4] = s */
-        7,  /* field[7] = strings */
-        1,  /* field[1] = type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = {
-        {1, 0}, {10, 8}, {0, 12}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_desc__attr__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.OpDesc.Attr",
-        "Attr",
-        "PaddleMobile__Framework__Proto__OpDesc__Attr",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr),
-        12,
-        paddle_mobile__framework__proto__op_desc__attr__field_descriptors,
-        paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name,
-        2,
-        paddle_mobile__framework__proto__op_desc__attr__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__op_desc__attr__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__op_desc__var__field_descriptors[2] = {
-        {
-            "parameter", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, parameter),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-        {
-            "arguments", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, n_arguments),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, arguments),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__op_desc__var__field_indices_by_name[] = {
-        1, /* field[1] = arguments */
-        0, /* field[0] = parameter */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__op_desc__var__number_ranges[1 + 1] = {
-        {1, 0}, {0, 2}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_desc__var__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.OpDesc.Var",
-        "Var",
-        "PaddleMobile__Framework__Proto__OpDesc__Var",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__OpDesc__Var),
-        2,
-        paddle_mobile__framework__proto__op_desc__var__field_descriptors,
-        paddle_mobile__framework__proto__op_desc__var__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__op_desc__var__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__op_desc__var__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const protobuf_c_boolean
-    paddle_mobile__framework__proto__op_desc__is_target__default_value = 0;
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__op_desc__field_descriptors[5] = {
-        {
-            "inputs", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, n_inputs),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, inputs),
-            &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "outputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, n_outputs),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, outputs),
-            &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "type", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, type), NULL, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, n_attrs),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, attrs),
-            &paddle_mobile__framework__proto__op_desc__attr__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "is_target", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, has_is_target),
-            offsetof(PaddleMobile__Framework__Proto__OpDesc, is_target), NULL,
-            &paddle_mobile__framework__proto__op_desc__is_target__default_value,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__op_desc__field_indices_by_name[] = {
-        3, /* field[3] = attrs */
-        0, /* field[0] = inputs */
-        4, /* field[4] = is_target */
-        1, /* field[1] = outputs */
-        2, /* field[2] = type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__op_desc__number_ranges[1 + 1] = {{1, 0},
-                                                                      {0, 5}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.OpDesc",
-        "OpDesc",
-        "PaddleMobile__Framework__Proto__OpDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__OpDesc),
-        5,
-        paddle_mobile__framework__proto__op_desc__field_descriptors,
-        paddle_mobile__framework__proto__op_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__op_desc__number_ranges,
-        (ProtobufCMessageInit)paddle_mobile__framework__proto__op_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const protobuf_c_boolean
-    paddle_mobile__framework__proto__op_proto__var__duplicable__default_value =
-        0;
-static const protobuf_c_boolean
-    paddle_mobile__framework__proto__op_proto__var__intermediate__default_value =
-        0;
-static const protobuf_c_boolean
-    paddle_mobile__framework__proto__op_proto__var__dispensable__default_value =
-        0;
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__op_proto__var__field_descriptors[5] = {
-        {
-            "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var, name), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "comment", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var, comment),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-        {
-            "duplicable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var,
-                     has_duplicable),
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var, duplicable),
-            NULL,
-            &paddle_mobile__framework__proto__op_proto__var__duplicable__default_value,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "intermediate", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var,
-                     has_intermediate),
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var,
-                     intermediate),
-            NULL,
-            &paddle_mobile__framework__proto__op_proto__var__intermediate__default_value,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "dispensable", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var,
-                     has_dispensable),
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Var, dispensable),
-            NULL,
-            &paddle_mobile__framework__proto__op_proto__var__dispensable__default_value,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = {
-        1, /* field[1] = comment */
-        4, /* field[4] = dispensable */
-        2, /* field[2] = duplicable */
-        3, /* field[3] = intermediate */
-        0, /* field[0] = name */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = {
-        {1, 0}, {0, 5}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_proto__var__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.OpProto.Var",
-        "Var",
-        "PaddleMobile__Framework__Proto__OpProto__Var",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__OpProto__Var),
-        5,
-        paddle_mobile__framework__proto__op_proto__var__field_descriptors,
-        paddle_mobile__framework__proto__op_proto__var__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__op_proto__var__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__op_proto__var__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const protobuf_c_boolean
-    paddle_mobile__framework__proto__op_proto__attr__generated__default_value =
-        0;
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__op_proto__attr__field_descriptors[4] = {
-        {
-            "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, name), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, type),
-            &paddle_mobile__framework__proto__attr_type__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "comment", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, comment),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-        {
-            "generated", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Attr,
-                     has_generated),
-            offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, generated),
-            NULL,
-            &paddle_mobile__framework__proto__op_proto__attr__generated__default_value,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name[] = {
-        2, /* field[2] = comment */
-        3, /* field[3] = generated */
-        0, /* field[0] = name */
-        1, /* field[1] = type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__op_proto__attr__number_ranges[1 + 1] = {
-        {1, 0}, {0, 4}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_proto__attr__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.OpProto.Attr",
-        "Attr",
-        "PaddleMobile__Framework__Proto__OpProto__Attr",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__OpProto__Attr),
-        4,
-        paddle_mobile__framework__proto__op_proto__attr__field_descriptors,
-        paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__op_proto__attr__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__op_proto__attr__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__op_proto__field_descriptors[5] = {
-        {
-            "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpProto, type), NULL, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "inputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__OpProto, n_inputs),
-            offsetof(PaddleMobile__Framework__Proto__OpProto, inputs),
-            &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "outputs", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__OpProto, n_outputs),
-            offsetof(PaddleMobile__Framework__Proto__OpProto, outputs),
-            &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__OpProto, n_attrs),
-            offsetof(PaddleMobile__Framework__Proto__OpProto, attrs),
-            &paddle_mobile__framework__proto__op_proto__attr__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "comment", 5, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__OpProto, comment), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__op_proto__field_indices_by_name[] = {
-        3, /* field[3] = attrs */
-        4, /* field[4] = comment */
-        1, /* field[1] = inputs */
-        2, /* field[2] = outputs */
-        0, /* field[0] = type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__op_proto__number_ranges[1 + 1] = {{1, 0},
-                                                                       {0, 5}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_proto__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.OpProto",
-        "OpProto",
-        "PaddleMobile__Framework__Proto__OpProto",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__OpProto),
-        5,
-        paddle_mobile__framework__proto__op_proto__field_descriptors,
-        paddle_mobile__framework__proto__op_proto__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__op_proto__number_ranges,
-        (ProtobufCMessageInit)paddle_mobile__framework__proto__op_proto__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors
-        [2] = {
-            {
-                "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM,
-                0, /* quantifier_offset */
-                offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc,
-                         data_type),
-                &paddle_mobile__framework__proto__var_type__type__descriptor,
-                NULL, 0,      /* flags */
-                0, NULL, NULL /* reserved1,reserved2, etc */
-            },
-            {
-                "dims", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64,
-                offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc,
-                         n_dims),
-                offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc,
-                         dims),
-                NULL, NULL, 0, /* flags */
-                0, NULL, NULL  /* reserved1,reserved2, etc */
-            },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name
-        [] = {
-            0, /* field[0] = data_type */
-            1, /* field[1] = dims */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges[1 +
-                                                                          1] = {
-        {1, 0}, {0, 2}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__tensor_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType.TensorDesc",
-        "TensorDesc",
-        "PaddleMobile__Framework__Proto__VarType__TensorDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarType__TensorDesc),
-        2,
-        paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors,
-        paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__var_type__tensor_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const int32_t
-    paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value =
-        0;
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors
-        [2] = {
-            {
-                "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE,
-                0, /* quantifier_offset */
-                offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc,
-                         tensor),
-                &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor,
-                NULL, 0,      /* flags */
-                0, NULL, NULL /* reserved1,reserved2, etc */
-            },
-            {
-                "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL,
-                PROTOBUF_C_TYPE_INT32,
-                offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc,
-                         has_lod_level),
-                offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc,
-                         lod_level),
-                NULL,
-                &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value,
-                0,            /* flags */
-                0, NULL, NULL /* reserved1,reserved2, etc */
-            },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name
-        [] = {
-            1, /* field[1] = lod_level */
-            0, /* field[0] = tensor */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges
-        [1 + 1] = {{1, 0}, {0, 2}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType.LoDTensorDesc",
-        "LoDTensorDesc",
-        "PaddleMobile__Framework__Proto__VarType__LoDTensorDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc),
-        2,
-        paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors,
-        paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const int32_t
-    paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value =
-        0;
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors
-        [2] = {
-            {
-                "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE,
-                0, /* quantifier_offset */
-                offsetof(
-                    PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc,
-                    tensor),
-                &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor,
-                NULL, 0,      /* flags */
-                0, NULL, NULL /* reserved1,reserved2, etc */
-            },
-            {
-                "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL,
-                PROTOBUF_C_TYPE_INT32,
-                offsetof(
-                    PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc,
-                    has_lod_level),
-                offsetof(
-                    PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc,
-                    lod_level),
-                NULL,
-                &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value,
-                0,            /* flags */
-                0, NULL, NULL /* reserved1,reserved2, etc */
-            },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name
-        [] = {
-            1, /* field[1] = lod_level */
-            0, /* field[0] = tensor */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges
-        [1 + 1] = {{1, 0}, {0, 2}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc",
-        "LoDTensorArrayDesc",
-        "PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc),
-        2,
-        paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors,
-        paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors[1] = {
-        {
-            "lod_tensor", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc,
-                     n_lod_tensor),
-            offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc,
-                     lod_tensor),
-            &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name
-        [] = {
-            0, /* field[0] = lod_tensor */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__reader_desc__number_ranges[1 +
-                                                                          1] = {
-        {1, 0}, {0, 1}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__reader_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType.ReaderDesc",
-        "ReaderDesc",
-        "PaddleMobile__Framework__Proto__VarType__ReaderDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarType__ReaderDesc),
-        1,
-        paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors,
-        paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__reader_desc__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__var_type__reader_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors
-        [2] = {
-            {
-                "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM,
-                0, /* quantifier_offset */
-                offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc,
-                         data_type),
-                &paddle_mobile__framework__proto__var_type__type__descriptor,
-                NULL, 0,      /* flags */
-                0, NULL, NULL /* reserved1,reserved2, etc */
-            },
-            {
-                "capacity", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT64,
-                0, /* quantifier_offset */
-                offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc,
-                         capacity),
-                NULL, NULL, 0, /* flags */
-                0, NULL, NULL  /* reserved1,reserved2, etc */
-            },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name
-        [] = {
-            1, /* field[1] = capacity */
-            0, /* field[0] = data_type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__channel_desc__number_ranges[1 +
-                                                                           1] =
-        {{1, 0}, {0, 2}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__channel_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType.ChannelDesc",
-        "ChannelDesc",
-        "PaddleMobile__Framework__Proto__VarType__ChannelDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarType__ChannelDesc),
-        2,
-        paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors,
-        paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__channel_desc__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__var_type__channel_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_type__tuple__field_descriptors[1] = {
-        {
-            "element_type", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_ENUM,
-            offsetof(PaddleMobile__Framework__Proto__VarType__Tuple,
-                     n_element_type),
-            offsetof(PaddleMobile__Framework__Proto__VarType__Tuple,
-                     element_type),
-            &paddle_mobile__framework__proto__var_type__type__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name[] =
-        {
-            0, /* field[0] = element_type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__tuple__number_ranges[1 + 1] = {
-        {1, 0}, {0, 1}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__tuple__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType.Tuple",
-        "Tuple",
-        "PaddleMobile__Framework__Proto__VarType__Tuple",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarType__Tuple),
-        1,
-        paddle_mobile__framework__proto__var_type__tuple__field_descriptors,
-        paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__tuple__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__var_type__tuple__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCEnumValue
-    paddle_mobile__framework__proto__var_type__type__enum_values_by_number[19] =
-        {
-            {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL",
-             0},
-            {"INT16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16",
-             1},
-            {"INT32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32",
-             2},
-            {"INT64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64",
-             3},
-            {"FP16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16",
-             4},
-            {"FP32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32",
-             5},
-            {"FP64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64",
-             6},
-            {"LOD_TENSOR",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR", 7},
-            {"SELECTED_ROWS",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS",
-             8},
-            {"FEED_MINIBATCH",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH",
-             9},
-            {"FETCH_LIST",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST", 10},
-            {"STEP_SCOPES",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES",
-             11},
-            {"LOD_RANK_TABLE",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE",
-             12},
-            {"LOD_TENSOR_ARRAY",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_"
-             "ARRAY",
-             13},
-            {"PLACE_LIST",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST", 14},
-            {"READER",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER", 15},
-            {"CHANNEL",
-             "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL", 16},
-            {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17},
-            {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE",
-             18},
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0},
-                                                                       {0, 19}};
-static const ProtobufCEnumValueIndex
-    paddle_mobile__framework__proto__var_type__type__enum_values_by_name[19] = {
-        {"BOOL", 0},
-        {"CHANNEL", 16},
-        {"FEED_MINIBATCH", 9},
-        {"FETCH_LIST", 10},
-        {"FP16", 4},
-        {"FP32", 5},
-        {"FP64", 6},
-        {"INT16", 1},
-        {"INT32", 2},
-        {"INT64", 3},
-        {"LOD_RANK_TABLE", 12},
-        {"LOD_TENSOR", 7},
-        {"LOD_TENSOR_ARRAY", 13},
-        {"PLACE_LIST", 14},
-        {"RAW", 17},
-        {"READER", 15},
-        {"SELECTED_ROWS", 8},
-        {"STEP_SCOPES", 11},
-        {"TUPLE", 18},
-};
-const ProtobufCEnumDescriptor
-    paddle_mobile__framework__proto__var_type__type__descriptor = {
-        PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType.Type",
-        "Type",
-        "PaddleMobile__Framework__Proto__VarType__Type",
-        "paddle_mobile.framework.proto",
-        19,
-        paddle_mobile__framework__proto__var_type__type__enum_values_by_number,
-        19,
-        paddle_mobile__framework__proto__var_type__type__enum_values_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__type__value_ranges,
-        NULL,
-        NULL,
-        NULL,
-        NULL /* reserved[1234] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_type__field_descriptors[7] = {
-        {
-            "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarType, type),
-            &paddle_mobile__framework__proto__var_type__type__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "selected_rows", 2, PROTOBUF_C_LABEL_OPTIONAL,
-            PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarType, selected_rows),
-            &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "lod_tensor", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarType, lod_tensor),
-            &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "tensor_array", 4, PROTOBUF_C_LABEL_OPTIONAL,
-            PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarType, tensor_array),
-            &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "reader", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarType, reader),
-            &paddle_mobile__framework__proto__var_type__reader_desc__descriptor,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "channel", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarType, channel),
-            &paddle_mobile__framework__proto__var_type__channel_desc__descriptor,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "tuple", 7, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarType, tuple),
-            &paddle_mobile__framework__proto__var_type__tuple__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_type__field_indices_by_name[] = {
-        5, /* field[5] = channel */
-        2, /* field[2] = lod_tensor */
-        4, /* field[4] = reader */
-        1, /* field[1] = selected_rows */
-        3, /* field[3] = tensor_array */
-        6, /* field[6] = tuple */
-        0, /* field[0] = type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_type__number_ranges[1 + 1] = {{1, 0},
-                                                                       {0, 7}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarType",
-        "VarType",
-        "PaddleMobile__Framework__Proto__VarType",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarType),
-        7,
-        paddle_mobile__framework__proto__var_type__field_descriptors,
-        paddle_mobile__framework__proto__var_type__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_type__number_ranges,
-        (ProtobufCMessageInit)paddle_mobile__framework__proto__var_type__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const protobuf_c_boolean
-    paddle_mobile__framework__proto__var_desc__persistable__default_value = 0;
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__var_desc__field_descriptors[3] = {
-        {
-            "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarDesc, name), NULL, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__VarDesc, type),
-            &paddle_mobile__framework__proto__var_type__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "persistable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL,
-            offsetof(PaddleMobile__Framework__Proto__VarDesc, has_persistable),
-            offsetof(PaddleMobile__Framework__Proto__VarDesc, persistable),
-            NULL,
-            &paddle_mobile__framework__proto__var_desc__persistable__default_value,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__var_desc__field_indices_by_name[] = {
-        0, /* field[0] = name */
-        2, /* field[2] = persistable */
-        1, /* field[1] = type */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__var_desc__number_ranges[1 + 1] = {{1, 0},
-                                                                       {0, 3}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.VarDesc",
-        "VarDesc",
-        "PaddleMobile__Framework__Proto__VarDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__VarDesc),
-        3,
-        paddle_mobile__framework__proto__var_desc__field_descriptors,
-        paddle_mobile__framework__proto__var_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__var_desc__number_ranges,
-        (ProtobufCMessageInit)paddle_mobile__framework__proto__var_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const int32_t
-    paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value =
-        -1;
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__block_desc__field_descriptors[5] = {
-        {
-            "idx", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc, idx), NULL,
-            NULL, 0,      /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "parent_idx", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32,
-            0, /* quantifier_offset */
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc, parent_idx),
-            NULL, NULL, 0, /* flags */
-            0, NULL, NULL  /* reserved1,reserved2, etc */
-        },
-        {
-            "vars", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_vars),
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc, vars),
-            &paddle_mobile__framework__proto__var_desc__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "ops", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_ops),
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc, ops),
-            &paddle_mobile__framework__proto__op_desc__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-        {
-            "forward_block_idx", 5, PROTOBUF_C_LABEL_OPTIONAL,
-            PROTOBUF_C_TYPE_INT32,
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc,
-                     has_forward_block_idx),
-            offsetof(PaddleMobile__Framework__Proto__BlockDesc,
-                     forward_block_idx),
-            NULL,
-            &paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__block_desc__field_indices_by_name[] = {
-        4, /* field[4] = forward_block_idx */
-        0, /* field[0] = idx */
-        3, /* field[3] = ops */
-        1, /* field[1] = parent_idx */
-        2, /* field[2] = vars */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__block_desc__number_ranges[1 + 1] = {
-        {1, 0}, {0, 5}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__block_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.BlockDesc",
-        "BlockDesc",
-        "PaddleMobile__Framework__Proto__BlockDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__BlockDesc),
-        5,
-        paddle_mobile__framework__proto__block_desc__field_descriptors,
-        paddle_mobile__framework__proto__block_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__block_desc__number_ranges,
-        (ProtobufCMessageInit)paddle_mobile__framework__proto__block_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__program_desc__field_descriptors[1] = {
-        {
-            "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
-            offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks),
-            offsetof(PaddleMobile__Framework__Proto__ProgramDesc, blocks),
-            &paddle_mobile__framework__proto__block_desc__descriptor, NULL,
-            0,            /* flags */
-            0, NULL, NULL /* reserved1,reserved2, etc */
-        },
-};
-static const unsigned
-    paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = {
-        0, /* field[0] = blocks */
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = {
-        {1, 0}, {0, 1}};
-const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__program_desc__descriptor = {
-        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.ProgramDesc",
-        "ProgramDesc",
-        "PaddleMobile__Framework__Proto__ProgramDesc",
-        "paddle_mobile.framework.proto",
-        sizeof(PaddleMobile__Framework__Proto__ProgramDesc),
-        1,
-        paddle_mobile__framework__proto__program_desc__field_descriptors,
-        paddle_mobile__framework__proto__program_desc__field_indices_by_name,
-        1,
-        paddle_mobile__framework__proto__program_desc__number_ranges,
-        (ProtobufCMessageInit)
-            paddle_mobile__framework__proto__program_desc__init,
-        NULL,
-        NULL,
-        NULL /* reserved[123] */
-};
-static const ProtobufCEnumValue
-    paddle_mobile__framework__proto__attr_type__enum_values_by_number[10] = {
-        {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0},
-        {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1},
-        {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2},
-        {"INTS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS", 3},
-        {"FLOATS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS", 4},
-        {"STRINGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS", 5},
-        {"BOOLEAN", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN", 6},
-        {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7},
-        {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8},
-        {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9},
-};
-static const ProtobufCIntRange
-    paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0},
-                                                                  {0, 10}};
-static const ProtobufCEnumValueIndex
-    paddle_mobile__framework__proto__attr_type__enum_values_by_name[10] = {
-        {"BLOCK", 8},  {"BOOLEAN", 6}, {"BOOLEANS", 7}, {"FLOAT", 1},
-        {"FLOATS", 4}, {"INT", 0},     {"INTS", 3},     {"LONG", 9},
-        {"STRING", 2}, {"STRINGS", 5},
-};
-const ProtobufCEnumDescriptor
-    paddle_mobile__framework__proto__attr_type__descriptor = {
-        PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC,
-        "paddle_mobile.framework.proto.AttrType",
-        "AttrType",
-        "PaddleMobile__Framework__Proto__AttrType",
-        "paddle_mobile.framework.proto",
-        10,
-        paddle_mobile__framework__proto__attr_type__enum_values_by_number,
-        10,
-        paddle_mobile__framework__proto__attr_type__enum_values_by_name,
-        1,
-        paddle_mobile__framework__proto__attr_type__value_ranges,
-        NULL,
-        NULL,
-        NULL,
-        NULL /* reserved[1234] */
-};
diff --git a/mobile/tools/quantification/src/framework.pb-c.h b/mobile/tools/quantification/src/framework.pb-c.h
deleted file mode 100644
index 3d63bad76a..0000000000
--- a/mobile/tools/quantification/src/framework.pb-c.h
+++ /dev/null
@@ -1,579 +0,0 @@
-/* Generated by the protocol buffer compiler.  DO NOT EDIT! */
-/* Generated from: framework.proto */
-
-#ifndef PROTOBUF_C_framework_2eproto__INCLUDED
-#define PROTOBUF_C_framework_2eproto__INCLUDED
-
-#include "protobuf-c.h"
-
-PROTOBUF_C__BEGIN_DECLS
-
-#if PROTOBUF_C_VERSION_NUMBER < 1000000
-# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
-#elif 1003000 < PROTOBUF_C_MIN_COMPILER_VERSION
-# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
-#endif
-
-typedef struct _PaddleMobile__Framework__Proto__OpDesc
-    PaddleMobile__Framework__Proto__OpDesc;
-typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr
-    PaddleMobile__Framework__Proto__OpDesc__Attr;
-typedef struct _PaddleMobile__Framework__Proto__OpDesc__Var
-    PaddleMobile__Framework__Proto__OpDesc__Var;
-typedef struct _PaddleMobile__Framework__Proto__OpProto
-    PaddleMobile__Framework__Proto__OpProto;
-typedef struct _PaddleMobile__Framework__Proto__OpProto__Var
-    PaddleMobile__Framework__Proto__OpProto__Var;
-typedef struct _PaddleMobile__Framework__Proto__OpProto__Attr
-    PaddleMobile__Framework__Proto__OpProto__Attr;
-typedef struct _PaddleMobile__Framework__Proto__VarType
-    PaddleMobile__Framework__Proto__VarType;
-typedef struct _PaddleMobile__Framework__Proto__VarType__TensorDesc
-    PaddleMobile__Framework__Proto__VarType__TensorDesc;
-typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc
-    PaddleMobile__Framework__Proto__VarType__LoDTensorDesc;
-typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc
-    PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc;
-typedef struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc
-    PaddleMobile__Framework__Proto__VarType__ReaderDesc;
-typedef struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc
-    PaddleMobile__Framework__Proto__VarType__ChannelDesc;
-typedef struct _PaddleMobile__Framework__Proto__VarType__Tuple
-    PaddleMobile__Framework__Proto__VarType__Tuple;
-typedef struct _PaddleMobile__Framework__Proto__VarDesc
-    PaddleMobile__Framework__Proto__VarDesc;
-typedef struct _PaddleMobile__Framework__Proto__BlockDesc
-    PaddleMobile__Framework__Proto__BlockDesc;
-typedef struct _PaddleMobile__Framework__Proto__ProgramDesc
-    PaddleMobile__Framework__Proto__ProgramDesc;
-
-/* --- enums --- */
-
-typedef enum _PaddleMobile__Framework__Proto__VarType__Type {
-  /*
-   * Pod Types
-   */
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6,
-  /*
-   * Other types that may need additional descriptions
-   */
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16,
-  /*
-   * Any runtime decided variable type is raw
-   * raw variables should manage their own allocations
-   * in operators like nccl_op
-   */
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE =
-      18 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
-          PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE)
-} PaddleMobile__Framework__Proto__VarType__Type;
-typedef enum _PaddleMobile__Framework__Proto__AttrType {
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT = 0,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT = 1,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING = 2,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS = 3,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS = 4,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS = 5,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG =
-      9 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
-          PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE)
-} PaddleMobile__Framework__Proto__AttrType;
-
-/* --- messages --- */
-
-struct _PaddleMobile__Framework__Proto__OpDesc__Attr {
-  ProtobufCMessage base;
-  char *name;
-  PaddleMobile__Framework__Proto__AttrType type;
-  protobuf_c_boolean has_i;
-  int32_t i;
-  protobuf_c_boolean has_f;
-  float f;
-  char *s;
-  size_t n_ints;
-  int32_t *ints;
-  size_t n_floats;
-  float *floats;
-  size_t n_strings;
-  char **strings;
-  protobuf_c_boolean has_b;
-  protobuf_c_boolean b;
-  size_t n_bools;
-  protobuf_c_boolean *bools;
-  protobuf_c_boolean has_block_idx;
-  int32_t block_idx;
-  protobuf_c_boolean has_l;
-  int64_t l;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT                   \
-  {                                                                            \
-    PROTOBUF_C_MESSAGE_INIT(                                                   \
-        &paddle_mobile__framework__proto__op_desc__attr__descriptor)           \
-    , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \
-        0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0                   \
-  }
-
-struct _PaddleMobile__Framework__Proto__OpDesc__Var {
-  ProtobufCMessage base;
-  char *parameter;
-  size_t n_arguments;
-  char **arguments;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT         \
-  {                                                                 \
-    PROTOBUF_C_MESSAGE_INIT(                                        \
-        &paddle_mobile__framework__proto__op_desc__var__descriptor) \
-    , NULL, 0, NULL                                                 \
-  }
-
-/*
- * OpDesc describes an instance of a C++ framework::OperatorBase
- * derived class type.
- */
-struct _PaddleMobile__Framework__Proto__OpDesc {
-  ProtobufCMessage base;
-  char *type;
-  size_t n_inputs;
-  PaddleMobile__Framework__Proto__OpDesc__Var **inputs;
-  size_t n_outputs;
-  PaddleMobile__Framework__Proto__OpDesc__Var **outputs;
-  size_t n_attrs;
-  PaddleMobile__Framework__Proto__OpDesc__Attr **attrs;
-  protobuf_c_boolean has_is_target;
-  protobuf_c_boolean is_target;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT         \
-  {                                                            \
-    PROTOBUF_C_MESSAGE_INIT(                                   \
-        &paddle_mobile__framework__proto__op_desc__descriptor) \
-    , NULL, 0, NULL, 0, NULL, 0, NULL, 0, 0                    \
-  }
-
-/*
- * VarProto describes the C++ type framework::Variable.
- */
-struct _PaddleMobile__Framework__Proto__OpProto__Var {
-  ProtobufCMessage base;
-  char *name;
-  char *comment;
-  protobuf_c_boolean has_duplicable;
-  protobuf_c_boolean duplicable;
-  protobuf_c_boolean has_intermediate;
-  protobuf_c_boolean intermediate;
-  protobuf_c_boolean has_dispensable;
-  protobuf_c_boolean dispensable;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT         \
-  {                                                                  \
-    PROTOBUF_C_MESSAGE_INIT(                                         \
-        &paddle_mobile__framework__proto__op_proto__var__descriptor) \
-    , NULL, NULL, 0, 0, 0, 0, 0, 0                                   \
-  }
-
-/*
- * AttrProto describes the C++ type Attribute.
- */
-struct _PaddleMobile__Framework__Proto__OpProto__Attr {
-  ProtobufCMessage base;
-  char *name;
-  PaddleMobile__Framework__Proto__AttrType type;
-  char *comment;
-  /*
-   * If that attribute is generated, it means the Paddle third
-   * language binding has responsibility to fill that
-   * attribute. End-User should not set that attribute.
-   */
-  protobuf_c_boolean has_generated;
-  protobuf_c_boolean generated;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT           \
-  {                                                                     \
-    PROTOBUF_C_MESSAGE_INIT(                                            \
-        &paddle_mobile__framework__proto__op_proto__attr__descriptor)   \
-    , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, NULL, 0, 0 \
-  }
-
-/*
- * OpProto describes a C++ framework::OperatorBase derived class.
- */
-struct _PaddleMobile__Framework__Proto__OpProto {
-  ProtobufCMessage base;
-  char *type;
-  size_t n_inputs;
-  PaddleMobile__Framework__Proto__OpProto__Var **inputs;
-  size_t n_outputs;
-  PaddleMobile__Framework__Proto__OpProto__Var **outputs;
-  size_t n_attrs;
-  PaddleMobile__Framework__Proto__OpProto__Attr **attrs;
-  char *comment;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT         \
-  {                                                             \
-    PROTOBUF_C_MESSAGE_INIT(                                    \
-        &paddle_mobile__framework__proto__op_proto__descriptor) \
-    , NULL, 0, NULL, 0, NULL, 0, NULL, NULL                     \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarType__TensorDesc {
-  ProtobufCMessage base;
-  /*
-   * Should only be PODType. Is enforced in C++
-   */
-  PaddleMobile__Framework__Proto__VarType__Type data_type;
-  /*
-   * [UNK, 640, 480] is saved as [-1, 640, 480]
-   */
-  size_t n_dims;
-  int64_t *dims;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT         \
-  {                                                                          \
-    PROTOBUF_C_MESSAGE_INIT(                                                 \
-        &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor) \
-    , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0, NULL         \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc {
-  ProtobufCMessage base;
-  PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor;
-  protobuf_c_boolean has_lod_level;
-  int32_t lod_level;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT         \
-  {                                                                              \
-    PROTOBUF_C_MESSAGE_INIT(                                                     \
-        &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor) \
-    , NULL, 0, 0                                                                 \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc {
-  ProtobufCMessage base;
-  PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor;
-  protobuf_c_boolean has_lod_level;
-  int32_t lod_level;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT         \
-  {                                                                                    \
-    PROTOBUF_C_MESSAGE_INIT(                                                           \
-        &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor) \
-    , NULL, 0, 0                                                                       \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc {
-  ProtobufCMessage base;
-  size_t n_lod_tensor;
-  PaddleMobile__Framework__Proto__VarType__LoDTensorDesc **lod_tensor;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT         \
-  {                                                                          \
-    PROTOBUF_C_MESSAGE_INIT(                                                 \
-        &paddle_mobile__framework__proto__var_type__reader_desc__descriptor) \
-    , 0, NULL                                                                \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc {
-  ProtobufCMessage base;
-  PaddleMobile__Framework__Proto__VarType__Type data_type;
-  int64_t capacity;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT         \
-  {                                                                           \
-    PROTOBUF_C_MESSAGE_INIT(                                                  \
-        &paddle_mobile__framework__proto__var_type__channel_desc__descriptor) \
-    , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0                \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarType__Tuple {
-  ProtobufCMessage base;
-  size_t n_element_type;
-  PaddleMobile__Framework__Proto__VarType__Type *element_type;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT         \
-  {                                                                    \
-    PROTOBUF_C_MESSAGE_INIT(                                           \
-        &paddle_mobile__framework__proto__var_type__tuple__descriptor) \
-    , 0, NULL                                                          \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarType {
-  ProtobufCMessage base;
-  PaddleMobile__Framework__Proto__VarType__Type type;
-  PaddleMobile__Framework__Proto__VarType__TensorDesc *selected_rows;
-  PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *lod_tensor;
-  PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *tensor_array;
-  PaddleMobile__Framework__Proto__VarType__ReaderDesc *reader;
-  PaddleMobile__Framework__Proto__VarType__ChannelDesc *channel;
-  PaddleMobile__Framework__Proto__VarType__Tuple *tuple;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT                        \
-  {                                                                            \
-    PROTOBUF_C_MESSAGE_INIT(                                                   \
-        &paddle_mobile__framework__proto__var_type__descriptor)                \
-    , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, NULL, NULL, NULL, \
-        NULL, NULL, NULL                                                       \
-  }
-
-struct _PaddleMobile__Framework__Proto__VarDesc {
-  ProtobufCMessage base;
-  char *name;
-  PaddleMobile__Framework__Proto__VarType *type;
-  protobuf_c_boolean has_persistable;
-  protobuf_c_boolean persistable;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT         \
-  {                                                             \
-    PROTOBUF_C_MESSAGE_INIT(                                    \
-        &paddle_mobile__framework__proto__var_desc__descriptor) \
-    , NULL, NULL, 0, 0                                          \
-  }
-
-struct _PaddleMobile__Framework__Proto__BlockDesc {
-  ProtobufCMessage base;
-  int32_t idx;
-  int32_t parent_idx;
-  size_t n_vars;
-  PaddleMobile__Framework__Proto__VarDesc **vars;
-  size_t n_ops;
-  PaddleMobile__Framework__Proto__OpDesc **ops;
-  protobuf_c_boolean has_forward_block_idx;
-  int32_t forward_block_idx;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT         \
-  {                                                               \
-    PROTOBUF_C_MESSAGE_INIT(                                      \
-        &paddle_mobile__framework__proto__block_desc__descriptor) \
-    , 0, 0, 0, NULL, 0, NULL, 0, -1                               \
-  }
-
-/*
- * Please refer to
- * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
- * for more details.
- * TODO(panyx0718): A model can have multiple programs. Need a
- * way to distinguish them. Maybe ID or name?
- */
-struct _PaddleMobile__Framework__Proto__ProgramDesc {
-  ProtobufCMessage base;
-  size_t n_blocks;
-  PaddleMobile__Framework__Proto__BlockDesc **blocks;
-};
-#define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT         \
-  {                                                                 \
-    PROTOBUF_C_MESSAGE_INIT(                                        \
-        &paddle_mobile__framework__proto__program_desc__descriptor) \
-    , 0, NULL                                                       \
-  }
-
-/* PaddleMobile__Framework__Proto__OpDesc__Attr methods */
-void paddle_mobile__framework__proto__op_desc__attr__init(
-    PaddleMobile__Framework__Proto__OpDesc__Attr *message);
-/* PaddleMobile__Framework__Proto__OpDesc__Var methods */
-void paddle_mobile__framework__proto__op_desc__var__init(
-    PaddleMobile__Framework__Proto__OpDesc__Var *message);
-/* PaddleMobile__Framework__Proto__OpDesc methods */
-void paddle_mobile__framework__proto__op_desc__init(
-    PaddleMobile__Framework__Proto__OpDesc *message);
-
-size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpDesc *message);
-
-PaddleMobile__Framework__Proto__OpDesc *
-paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
-                                                 size_t len,
-                                                 const uint8_t *data);
-void paddle_mobile__framework__proto__op_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__OpDesc *message,
-    ProtobufCAllocator *allocator);
-/* PaddleMobile__Framework__Proto__OpProto__Var methods */
-void paddle_mobile__framework__proto__op_proto__var__init(
-    PaddleMobile__Framework__Proto__OpProto__Var *message);
-/* PaddleMobile__Framework__Proto__OpProto__Attr methods */
-void paddle_mobile__framework__proto__op_proto__attr__init(
-    PaddleMobile__Framework__Proto__OpProto__Attr *message);
-/* PaddleMobile__Framework__Proto__OpProto methods */
-void paddle_mobile__framework__proto__op_proto__init(
-    PaddleMobile__Framework__Proto__OpProto *message);
-size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpProto *message);
-PaddleMobile__Framework__Proto__OpProto *
-paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
-                                                  size_t len,
-                                                  const uint8_t *data);
-void paddle_mobile__framework__proto__op_proto__free_unpacked(
-    PaddleMobile__Framework__Proto__OpProto *message,
-    ProtobufCAllocator *allocator);
-/* PaddleMobile__Framework__Proto__VarType__TensorDesc methods */
-void paddle_mobile__framework__proto__var_type__tensor_desc__init(
-    PaddleMobile__Framework__Proto__VarType__TensorDesc *message);
-/* PaddleMobile__Framework__Proto__VarType__LoDTensorDesc methods */
-void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init(
-    PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message);
-/* PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc methods */
-void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init(
-    PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message);
-/* PaddleMobile__Framework__Proto__VarType__ReaderDesc methods */
-void paddle_mobile__framework__proto__var_type__reader_desc__init(
-    PaddleMobile__Framework__Proto__VarType__ReaderDesc *message);
-/* PaddleMobile__Framework__Proto__VarType__ChannelDesc methods */
-void paddle_mobile__framework__proto__var_type__channel_desc__init(
-    PaddleMobile__Framework__Proto__VarType__ChannelDesc *message);
-/* PaddleMobile__Framework__Proto__VarType__Tuple methods */
-void paddle_mobile__framework__proto__var_type__tuple__init(
-    PaddleMobile__Framework__Proto__VarType__Tuple *message);
-/* PaddleMobile__Framework__Proto__VarType methods */
-void paddle_mobile__framework__proto__var_type__init(
-    PaddleMobile__Framework__Proto__VarType *message);
-size_t paddle_mobile__framework__proto__var_type__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarType *message);
-PaddleMobile__Framework__Proto__VarType *
-paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator,
-                                                  size_t len,
-                                                  const uint8_t *data);
-void paddle_mobile__framework__proto__var_type__free_unpacked(
-    PaddleMobile__Framework__Proto__VarType *message,
-    ProtobufCAllocator *allocator);
-/* PaddleMobile__Framework__Proto__VarDesc methods */
-void paddle_mobile__framework__proto__var_desc__init(
-    PaddleMobile__Framework__Proto__VarDesc *message);
-size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarDesc *message);
-PaddleMobile__Framework__Proto__VarDesc *
-paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
-                                                  size_t len,
-                                                  const uint8_t *data);
-void paddle_mobile__framework__proto__var_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__VarDesc *message,
-    ProtobufCAllocator *allocator);
-/* PaddleMobile__Framework__Proto__BlockDesc methods */
-void paddle_mobile__framework__proto__block_desc__init(
-    PaddleMobile__Framework__Proto__BlockDesc *message);
-size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__BlockDesc *message);
-PaddleMobile__Framework__Proto__BlockDesc *
-paddle_mobile__framework__proto__block_desc__unpack(
-    ProtobufCAllocator *allocator, size_t len, const uint8_t *data);
-void paddle_mobile__framework__proto__block_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__BlockDesc *message,
-    ProtobufCAllocator *allocator);
-/* PaddleMobile__Framework__Proto__ProgramDesc methods */
-void paddle_mobile__framework__proto__program_desc__init(
-    PaddleMobile__Framework__Proto__ProgramDesc *message);
-size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__ProgramDesc *message);
-PaddleMobile__Framework__Proto__ProgramDesc *
-paddle_mobile__framework__proto__program_desc__unpack(
-    ProtobufCAllocator *allocator, size_t len, const uint8_t *data);
-void paddle_mobile__framework__proto__program_desc__free_unpacked(
-    PaddleMobile__Framework__Proto__ProgramDesc *message,
-    ProtobufCAllocator *allocator);
-/* --- per-message closures --- */
-
-typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)(
-    const PaddleMobile__Framework__Proto__OpDesc__Attr *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__OpDesc__Var_Closure)(
-    const PaddleMobile__Framework__Proto__OpDesc__Var *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__OpDesc_Closure)(
-    const PaddleMobile__Framework__Proto__OpDesc *message, void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__OpProto__Var_Closure)(
-    const PaddleMobile__Framework__Proto__OpProto__Var *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__OpProto__Attr_Closure)(
-    const PaddleMobile__Framework__Proto__OpProto__Attr *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__OpProto_Closure)(
-    const PaddleMobile__Framework__Proto__OpProto *message, void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure)(
-    const PaddleMobile__Framework__Proto__VarType__TensorDesc *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__VarType__LoDTensorDesc_Closure)(
-    const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message,
-    void *closure_data);
-typedef void (
-    *PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc_Closure)(
-    const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__VarType__ReaderDesc_Closure)(
-    const PaddleMobile__Framework__Proto__VarType__ReaderDesc *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__VarType__ChannelDesc_Closure)(
-    const PaddleMobile__Framework__Proto__VarType__ChannelDesc *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__VarType__Tuple_Closure)(
-    const PaddleMobile__Framework__Proto__VarType__Tuple *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__VarType_Closure)(
-    const PaddleMobile__Framework__Proto__VarType *message, void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__VarDesc_Closure)(
-    const PaddleMobile__Framework__Proto__VarDesc *message, void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__BlockDesc_Closure)(
-    const PaddleMobile__Framework__Proto__BlockDesc *message,
-    void *closure_data);
-typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)(
-    const PaddleMobile__Framework__Proto__ProgramDesc *message,
-    void *closure_data);
-
-/* --- services --- */
-
-/* --- descriptors --- */
-
-extern const ProtobufCEnumDescriptor
-    paddle_mobile__framework__proto__attr_type__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_desc__attr__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_desc__var__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_proto__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_proto__var__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__op_proto__attr__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__tensor_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__reader_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__channel_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_type__tuple__descriptor;
-extern const ProtobufCEnumDescriptor
-    paddle_mobile__framework__proto__var_type__type__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__var_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__block_desc__descriptor;
-extern const ProtobufCMessageDescriptor
-    paddle_mobile__framework__proto__program_desc__descriptor;
-
-PROTOBUF_C__END_DECLS
-
-#endif /* PROTOBUF_C_framework_2eproto__INCLUDED */
diff --git a/mobile/tools/quantification/src/program_desc.cpp b/mobile/tools/quantification/src/program_desc.cpp
deleted file mode 100644
index 4f9984832a..0000000000
--- a/mobile/tools/quantification/src/program_desc.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-//
-// Created by 谢柏渊 on 2018/7/25.
-//
-
-#include "src/program_desc.h"
-#include 
-
-ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) {
-  for (int i = 0; i < desc->n_blocks; ++i) {
-    blocks_.emplace_back(std::make_shared(desc->blocks[i]));
-  }
-}
-
-const std::vector> ProgramDesc::Blocks() {
-  return blocks_;
-}
diff --git a/mobile/tools/quantification/src/program_desc.h b/mobile/tools/quantification/src/program_desc.h
deleted file mode 100644
index 60a0f757b0..0000000000
--- a/mobile/tools/quantification/src/program_desc.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-//
-// Created by 谢柏渊 on 2018/7/25.
-//
-
-#ifndef TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
-#define TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
-
-#include 
-#include 
-#include "src/block_desc_local.h"
-#include "src/framework.pb-c.h"
-
-class ProgramDesc {
- public:
-  //    friend class Node;
-  //
-  //    friend class ProgramOptimize;
-
-  explicit ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc);
-
-  const std::vector> Blocks();
-
- private:
-  std::vector> blocks_;
-};
-
-#endif  // TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
diff --git a/mobile/tools/quantification/src/protobuf-c.c b/mobile/tools/quantification/src/protobuf-c.c
deleted file mode 100644
index 1092e3f78b..0000000000
--- a/mobile/tools/quantification/src/protobuf-c.c
+++ /dev/null
@@ -1,2098 +0,0 @@
-/*
- * Copyright (c) 2008-2015, Dave Benson and the protobuf-c authors.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file
- * Support library for `protoc-c` generated code.
- *
- * This file implements the public API used by the code generated
- * by `protoc-c`.
- *
- * \authors Dave Benson and the protobuf-c authors
- *
- * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license.
- */
-
-/**
- * \todo 64-BIT OPTIMIZATION: certain implementations use 32-bit math
- * even on 64-bit platforms (uint64_size, uint64_pack, parse_uint64).
- *
- * \todo Use size_t consistently.
- */
-
-#include  /* for malloc, free */
-#include  /* for strcmp, strlen, memcpy, memmove, memset */
-
-#include "protobuf-c.h"
-
-#define TRUE 1
-#define FALSE 0
-
-#define PROTOBUF_C__ASSERT_NOT_REACHED() assert(0)
-
-/* Workaround for Microsoft compilers. */
-#ifdef _MSC_VER
-#define inline __inline
-#endif
-
-/**
- * \defgroup internal Internal functions and macros
- *
- * These are not exported by the library but are useful to developers working
- * on `libprotobuf-c` itself.
- */
-
-/**
- * \defgroup macros Utility macros for manipulating structures
- *
- * Macros and constants used to manipulate the base "classes" generated by
- * `protobuf-c`. They also define limits and check correctness.
- *
- * \ingroup internal
- * @{
- */
-
-/** The maximum length of a 64-bit integer in varint encoding. */
-#define MAX_UINT64_ENCODED_SIZE 10
-
-#ifndef PROTOBUF_C_UNPACK_ERROR
-#define PROTOBUF_C_UNPACK_ERROR(...)
-#endif
-
-const char protobuf_c_empty_string[] = "";
-
-/**
- * Internal `ProtobufCMessage` manipulation macro.
- *
- * Base macro for manipulating a `ProtobufCMessage`. Used by STRUCT_MEMBER() and
- * STRUCT_MEMBER_PTR().
- */
-#define STRUCT_MEMBER_P(struct_p, struct_offset) \
-  ((void *)((uint8_t *)(struct_p) + (struct_offset)))
-
-/**
- * Return field in a `ProtobufCMessage` based on offset.
- *
- * Take a pointer to a `ProtobufCMessage` and find the field at the offset.
- * Cast it to the passed type.
- */
-#define STRUCT_MEMBER(member_type, struct_p, struct_offset) \
-  (*(member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset)))
-
-/**
- * Return field in a `ProtobufCMessage` based on offset.
- *
- * Take a pointer to a `ProtobufCMessage` and find the field at the offset. Cast
- * it to a pointer to the passed type.
- */
-#define STRUCT_MEMBER_PTR(member_type, struct_p, struct_offset) \
-  ((member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset)))
-
-/* Assertions for magic numbers. */
-
-#define ASSERT_IS_ENUM_DESCRIPTOR(desc) \
-  assert((desc)->magic == PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC)
-
-#define ASSERT_IS_MESSAGE_DESCRIPTOR(desc) \
-  assert((desc)->magic == PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC)
-
-#define ASSERT_IS_MESSAGE(message) \
-  ASSERT_IS_MESSAGE_DESCRIPTOR((message)->descriptor)
-
-#define ASSERT_IS_SERVICE_DESCRIPTOR(desc) \
-  assert((desc)->magic == PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC)
-
-/**@}*/
-
-/* --- version --- */
-
-const char *protobuf_c_version(void) { return PROTOBUF_C_VERSION; }
-
-uint32_t protobuf_c_version_number(void) { return PROTOBUF_C_VERSION_NUMBER; }
-
-/* --- allocator --- */
-
-static void *system_alloc(void *allocator_data, size_t size) {
-  return malloc(size);
-}
-
-static void system_free(void *allocator_data, void *data) { free(data); }
-
-static inline void *do_alloc(ProtobufCAllocator *allocator, size_t size) {
-  return allocator->alloc(allocator->allocator_data, size);
-}
-
-static inline void do_free(ProtobufCAllocator *allocator, void *data) {
-  if (data != NULL) allocator->free(allocator->allocator_data, data);
-}
-
-/*
- * This allocator uses the system's malloc() and free(). It is the default
- * allocator used if NULL is passed as the ProtobufCAllocator to an exported
- * function.
- */
-static ProtobufCAllocator protobuf_c__allocator = {
-    .alloc = &system_alloc,
-    .free = &system_free,
-    .allocator_data = NULL,
-};
-
-/* === buffer-simple === */
-
-void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len,
-                                     const uint8_t *data) {
-  ProtobufCBufferSimple *simp = (ProtobufCBufferSimple *)buffer;
-  size_t new_len = simp->len + len;
-
-  if (new_len > simp->alloced) {
-    ProtobufCAllocator *allocator = simp->allocator;
-    size_t new_alloced = simp->alloced * 2;
-    uint8_t *new_data;
-
-    if (allocator == NULL) allocator = &protobuf_c__allocator;
-    while (new_alloced < new_len) new_alloced += new_alloced;
-    new_data = do_alloc(allocator, new_alloced);
-    if (!new_data) return;
-    memcpy(new_data, simp->data, simp->len);
-    if (simp->must_free_data)
-      do_free(allocator, simp->data);
-    else
-      simp->must_free_data = TRUE;
-    simp->data = new_data;
-    simp->alloced = new_alloced;
-  }
-  memcpy(simp->data + simp->len, data, len);
-  simp->len = new_len;
-}
-
-/**
- * \defgroup packedsz protobuf_c_message_get_packed_size() implementation
- *
- * Routines mainly used by protobuf_c_message_get_packed_size().
- *
- * \ingroup internal
- * @{
- */
-
-/**
- * Return the number of bytes required to store the tag for the field. Includes
- * 3 bits for the wire-type, and a single bit that denotes the end-of-tag.
- *
- * \param number
- *      Field tag to encode.
- * \return
- *      Number of bytes required.
- */
-static inline size_t get_tag_size(uint32_t number) {
-  if (number < (1UL << 4)) {
-    return 1;
-  } else if (number < (1UL << 11)) {
-    return 2;
-  } else if (number < (1UL << 18)) {
-    return 3;
-  } else if (number < (1UL << 25)) {
-    return 4;
-  } else {
-    return 5;
-  }
-}
-
-/**
- * Return the number of bytes required to store a variable-length unsigned
- * 32-bit integer in base-128 varint encoding.
- *
- * \param v
- *      Value to encode.
- * \return
- *      Number of bytes required.
- */
-static inline size_t uint32_size(uint32_t v) {
-  if (v < (1UL << 7)) {
-    return 1;
-  } else if (v < (1UL << 14)) {
-    return 2;
-  } else if (v < (1UL << 21)) {
-    return 3;
-  } else if (v < (1UL << 28)) {
-    return 4;
-  } else {
-    return 5;
-  }
-}
-
-/**
- * Return the number of bytes required to store a variable-length signed 32-bit
- * integer in base-128 varint encoding.
- *
- * \param v
- *      Value to encode.
- * \return
- *      Number of bytes required.
- */
-static inline size_t int32_size(int32_t v) {
-  if (v < 0) {
-    return 10;
-  } else if (v < (1L << 7)) {
-    return 1;
-  } else if (v < (1L << 14)) {
-    return 2;
-  } else if (v < (1L << 21)) {
-    return 3;
-  } else if (v < (1L << 28)) {
-    return 4;
-  } else {
-    return 5;
-  }
-}
-
-/**
- * Return the ZigZag-encoded 32-bit unsigned integer form of a 32-bit signed
- * integer.
- *
- * \param v
- *      Value to encode.
- * \return
- *      ZigZag encoded integer.
- */
-static inline uint32_t zigzag32(int32_t v) {
-  if (v < 0)
-    return (-(uint32_t)v) * 2 - 1;
-  else
-    return (uint32_t)(v)*2;
-}
-
-/**
- * Return the number of bytes required to store a signed 32-bit integer,
- * converted to an unsigned 32-bit integer with ZigZag encoding, using base-128
- * varint encoding.
- *
- * \param v
- *      Value to encode.
- * \return
- *      Number of bytes required.
- */
-static inline size_t sint32_size(int32_t v) { return uint32_size(zigzag32(v)); }
-
-/**
- * Return the number of bytes required to store a 64-bit unsigned integer in
- * base-128 varint encoding.
- *
- * \param v
- *      Value to encode.
- * \return
- *      Number of bytes required.
- */
-static inline size_t uint64_size(uint64_t v) {
-  uint32_t upper_v = (uint32_t)(v >> 32);
-
-  if (upper_v == 0) {
-    return uint32_size((uint32_t)v);
-  } else if (upper_v < (1UL << 3)) {
-    return 5;
-  } else if (upper_v < (1UL << 10)) {
-    return 6;
-  } else if (upper_v < (1UL << 17)) {
-    return 7;
-  } else if (upper_v < (1UL << 24)) {
-    return 8;
-  } else if (upper_v < (1UL << 31)) {
-    return 9;
-  } else {
-    return 10;
-  }
-}
-
-/**
- * Return the ZigZag-encoded 64-bit unsigned integer form of a 64-bit signed
- * integer.
- *
- * \param v
- *      Value to encode.
- * \return
- *      ZigZag encoded integer.
- */
-static inline uint64_t zigzag64(int64_t v) {
-  if (v < 0)
-    return (-(uint64_t)v) * 2 - 1;
-  else
-    return (uint64_t)(v)*2;
-}
-
-/**
- * Return the number of bytes required to store a signed 64-bit integer,
- * converted to an unsigned 64-bit integer with ZigZag encoding, using base-128
- * varint encoding.
- *
- * \param v
- *      Value to encode.
- * \return
- *      Number of bytes required.
- */
-static inline size_t sint64_size(int64_t v) { return uint64_size(zigzag64(v)); }
-
-/**
- * Calculate the serialized size of a single required message field, including
- * the space needed by the preceding tag.
- *
- * \param field
- *      Field descriptor for member.
- * \param member
- *      Field to encode.
- * \return
- *      Number of bytes required.
- */
-static size_t required_field_get_packed_size(
-    const ProtobufCFieldDescriptor *field, const void *member) {
-  size_t rv = get_tag_size(field->id);
-
-  switch (field->type) {
-    case PROTOBUF_C_TYPE_SINT32:
-      return rv + sint32_size(*(const int32_t *)member);
-    case PROTOBUF_C_TYPE_ENUM:
-    case PROTOBUF_C_TYPE_INT32:
-      return rv + int32_size(*(const int32_t *)member);
-    case PROTOBUF_C_TYPE_UINT32:
-      return rv + uint32_size(*(const uint32_t *)member);
-    case PROTOBUF_C_TYPE_SINT64:
-      return rv + sint64_size(*(const int64_t *)member);
-    case PROTOBUF_C_TYPE_INT64:
-    case PROTOBUF_C_TYPE_UINT64:
-      return rv + uint64_size(*(const uint64_t *)member);
-    case PROTOBUF_C_TYPE_SFIXED32:
-    case PROTOBUF_C_TYPE_FIXED32:
-      return rv + 4;
-    case PROTOBUF_C_TYPE_SFIXED64:
-    case PROTOBUF_C_TYPE_FIXED64:
-      return rv + 8;
-    case PROTOBUF_C_TYPE_BOOL:
-      return rv + 1;
-    case PROTOBUF_C_TYPE_FLOAT:
-      return rv + 4;
-    case PROTOBUF_C_TYPE_DOUBLE:
-      return rv + 8;
-    case PROTOBUF_C_TYPE_STRING: {
-      const char *str = *(char *const *)member;
-      size_t len = str ? strlen(str) : 0;
-      return rv + uint32_size(len) + len;
-    }
-    case PROTOBUF_C_TYPE_BYTES: {
-      size_t len = ((const ProtobufCBinaryData *)member)->len;
-      return rv + uint32_size(len) + len;
-    }
-    case PROTOBUF_C_TYPE_MESSAGE: {
-      const ProtobufCMessage *msg = *(ProtobufCMessage *const *)member;
-      size_t subrv = msg ? protobuf_c_message_get_packed_size(msg) : 0;
-      return rv + uint32_size(subrv) + subrv;
-    }
-  }
-  PROTOBUF_C__ASSERT_NOT_REACHED();
-  return 0;
-}
-
-/**
- * Calculate the serialized size of a single oneof message field, including
- * the space needed by the preceding tag. Returns 0 if the oneof field isn't
- * selected or is not set.
- *
- * \param field
- *      Field descriptor for member.
- * \param oneof_case
- *      Enum value that selects the field in the oneof.
- * \param member
- *      Field to encode.
- * \return
- *      Number of bytes required.
- */
-static size_t oneof_field_get_packed_size(const ProtobufCFieldDescriptor *field,
-                                          uint32_t oneof_case,
-                                          const void *member) {
-  if (oneof_case != field->id) {
-    return 0;
-  }
-  if (field->type == PROTOBUF_C_TYPE_MESSAGE ||
-      field->type == PROTOBUF_C_TYPE_STRING) {
-    const void *ptr = *(const void *const *)member;
-    if (ptr == NULL || ptr == field->default_value) return 0;
-  }
-  return required_field_get_packed_size(field, member);
-}
-
-/**
- * Calculate the serialized size of a single optional message field, including
- * the space needed by the preceding tag. Returns 0 if the optional field isn't
- * set.
- *
- * \param field
- *      Field descriptor for member.
- * \param has
- *      True if the field exists, false if not.
- * \param member
- *      Field to encode.
- * \return
- *      Number of bytes required.
- */
-static size_t optional_field_get_packed_size(
-    const ProtobufCFieldDescriptor *field, const protobuf_c_boolean has,
-    const void *member) {
-  if (field->type == PROTOBUF_C_TYPE_MESSAGE ||
-      field->type == PROTOBUF_C_TYPE_STRING) {
-    const void *ptr = *(const void *const *)member;
-    if (ptr == NULL || ptr == field->default_value) return 0;
-  } else {
-    if (!has) return 0;
-  }
-  return required_field_get_packed_size(field, member);
-}
-
-static protobuf_c_boolean field_is_zeroish(
-    const ProtobufCFieldDescriptor *field, const void *member) {
-  protobuf_c_boolean ret = FALSE;
-
-  switch (field->type) {
-    case PROTOBUF_C_TYPE_BOOL:
-      ret = (0 == *(const protobuf_c_boolean *)member);
-      break;
-    case PROTOBUF_C_TYPE_ENUM:
-    case PROTOBUF_C_TYPE_SINT32:
-    case PROTOBUF_C_TYPE_INT32:
-    case PROTOBUF_C_TYPE_UINT32:
-    case PROTOBUF_C_TYPE_SFIXED32:
-    case PROTOBUF_C_TYPE_FIXED32:
-      ret = (0 == *(const uint32_t *)member);
-      break;
-    case PROTOBUF_C_TYPE_SINT64:
-    case PROTOBUF_C_TYPE_INT64:
-    case PROTOBUF_C_TYPE_UINT64:
-    case PROTOBUF_C_TYPE_SFIXED64:
-    case PROTOBUF_C_TYPE_FIXED64:
-      ret = (0 == *(const uint64_t *)member);
-      break;
-    case PROTOBUF_C_TYPE_FLOAT:
-      ret = (0 == *(const float *)member);
-      break;
-    case PROTOBUF_C_TYPE_DOUBLE:
-      ret = (0 == *(const double *)member);
-      break;
-    case PROTOBUF_C_TYPE_STRING:
-      ret = (NULL == *(const char *const *)member) ||
-            ('\0' == **(const char *const *)member);
-      break;
-    case PROTOBUF_C_TYPE_BYTES:
-    case PROTOBUF_C_TYPE_MESSAGE:
-      ret = (NULL == *(const void *const *)member);
-      break;
-    default:
-      ret = TRUE;
-      break;
-  }
-
-  return ret;
-}
-
-/**
- * Calculate the serialized size of a single unlabeled message field, including
- * the space needed by the preceding tag. Returns 0 if the field isn't set or
- * if it is set to a "zeroish" value (null pointer or 0 for numerical values).
- * Unlabeled fields are supported only in proto3.
- *
- * \param field
- *      Field descriptor for member.
- * \param member
- *      Field to encode.
- * \return
- *      Number of bytes required.
- */
-static size_t unlabeled_field_get_packed_size(
-    const ProtobufCFieldDescriptor *field, const void *member) {
-  if (field_is_zeroish(field, member)) return 0;
-  return required_field_get_packed_size(field, member);
-}
-
-/**
- * Calculate the serialized size of repeated message fields, which may consist
- * of any number of values (including 0). Includes the space needed by the
- * preceding tags (as needed).
- *
- * \param field
- *      Field descriptor for member.
- * \param count
- *      Number of repeated field members.
- * \param member
- *      Field to encode.
- * \return
- *      Number of bytes required.
- */
-static size_t repeated_field_get_packed_size(
-    const ProtobufCFieldDescriptor *field, size_t count, const void *member) {
-  size_t header_size;
-  size_t rv = 0;
-  unsigned i;
-  void *array = *(void *const *)member;
-
-  if (count == 0) return 0;
-  header_size = get_tag_size(field->id);
-  if (0 == (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) header_size *= count;
-
-  switch (field->type) {
-    case PROTOBUF_C_TYPE_SINT32:
-      for (i = 0; i < count; i++) rv += sint32_size(((int32_t *)array)[i]);
-      break;
-    case PROTOBUF_C_TYPE_ENUM:
-    case PROTOBUF_C_TYPE_INT32:
-      for (i = 0; i < count; i++) rv += int32_size(((int32_t *)array)[i]);
-      break;
-    case PROTOBUF_C_TYPE_UINT32:
-      for (i = 0; i < count; i++) rv += uint32_size(((uint32_t *)array)[i]);
-      break;
-    case PROTOBUF_C_TYPE_SINT64:
-      for (i = 0; i < count; i++) rv += sint64_size(((int64_t *)array)[i]);
-      break;
-    case PROTOBUF_C_TYPE_INT64:
-    case PROTOBUF_C_TYPE_UINT64:
-      for (i = 0; i < count; i++) rv += uint64_size(((uint64_t *)array)[i]);
-      break;
-    case PROTOBUF_C_TYPE_SFIXED32:
-    case PROTOBUF_C_TYPE_FIXED32:
-    case PROTOBUF_C_TYPE_FLOAT:
-      rv += 4 * count;
-      break;
-    case PROTOBUF_C_TYPE_SFIXED64:
-    case PROTOBUF_C_TYPE_FIXED64:
-    case PROTOBUF_C_TYPE_DOUBLE:
-      rv += 8 * count;
-      break;
-    case PROTOBUF_C_TYPE_BOOL:
-      rv += count;
-      break;
-    case PROTOBUF_C_TYPE_STRING:
-      for (i = 0; i < count; i++) {
-        size_t len = strlen(((char **)array)[i]);
-        rv += uint32_size(len) + len;
-      }
-      break;
-    case PROTOBUF_C_TYPE_BYTES:
-      for (i = 0; i < count; i++) {
-        size_t len = ((ProtobufCBinaryData *)array)[i].len;
-        rv += uint32_size(len) + len;
-      }
-      break;
-    case PROTOBUF_C_TYPE_MESSAGE:
-      for (i = 0; i < count; i++) {
-        size_t len =
-            protobuf_c_message_get_packed_size(((ProtobufCMessage **)array)[i]);
-        rv += uint32_size(len) + len;
-      }
-      break;
-  }
-
-  if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED))
-    header_size += uint32_size(rv);
-  return header_size + rv;
-}
-
-/**
- * Calculate the serialized size of an unknown field, i.e. one that is passed
- * through mostly uninterpreted. This is required for forward compatibility if
- * new fields are added to the message descriptor.
- *
- * \param field
- *      Unknown field type.
- * \return
- *      Number of bytes required.
- */
-static inline size_t unknown_field_get_packed_size(
-    const ProtobufCMessageUnknownField *field) {
-  return get_tag_size(field->tag) + field->len;
-}
-
-/**@}*/
-
-/*
- * Calculate the serialized size of the message.
- */
-size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message) {
-  unsigned i;
-  size_t rv = 0;
-
-  ASSERT_IS_MESSAGE(message);
-  for (i = 0; i < message->descriptor->n_fields; i++) {
-    const ProtobufCFieldDescriptor *field = message->descriptor->fields + i;
-    const void *member = ((const char *)message) + field->offset;
-    const void *qmember = ((const char *)message) + field->quantifier_offset;
-
-    if (field->label == PROTOBUF_C_LABEL_REQUIRED) {
-      rv += required_field_get_packed_size(field, member);
-    } else if ((field->label == PROTOBUF_C_LABEL_OPTIONAL ||
-                field->label == PROTOBUF_C_LABEL_NONE) &&
-               (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF))) {
-      rv += oneof_field_get_packed_size(field, *(const uint32_t *)qmember,
-                                        member);
-    } else if (field->label == PROTOBUF_C_LABEL_OPTIONAL) {
-      rv += optional_field_get_packed_size(
-          field, *(protobuf_c_boolean *)qmember, member);
-    } else if (field->label == PROTOBUF_C_LABEL_NONE) {
-      rv += unlabeled_field_get_packed_size(field, member);
-    } else {
-      rv += repeated_field_get_packed_size(field, *(const size_t *)qmember,
-                                           member);
-    }
-  }
-  for (i = 0; i < message->n_unknown_fields; i++)
-    rv += unknown_field_get_packed_size(&message->unknown_fields[i]);
-  return rv;
-}
-
-/**
- * \defgroup pack protobuf_c_message_pack() implementation
- *
- * Routines mainly used by protobuf_c_message_pack().
- *
- * \ingroup internal
- * @{
- */
-
-/**
- * Pack an unsigned 32-bit integer in base-128 varint encoding and return the
- * number of bytes written, which must be 5 or less.
- *
- * \param value
- *      Value to encode.
- * \param[out] out
- *      Packed value.
- * \return
- *      Number of bytes written to `out`.
- */
-static inline size_t uint32_pack(uint32_t value, uint8_t *out) {
-  unsigned rv = 0;
-
-  if (value >= 0x80) {
-    out[rv++] = value | 0x80;
-    value >>= 7;
-    if (value >= 0x80) {
-      out[rv++] = value | 0x80;
-      value >>= 7;
-      if (value >= 0x80) {
-        out[rv++] = value | 0x80;
-        value >>= 7;
-        if (value >= 0x80) {
-          out[rv++] = value | 0x80;
-          value >>= 7;
-        }
-      }
-    }
-  }
-  /* assert: value<128 */
-  out[rv++] = value;
-  return rv;
-}
-
-/**
- * Pack a 64-bit unsigned integer using base-128 varint encoding and return the
- * number of bytes written.
- *
- * \param value
- *      Value to encode.
- * \param[out] out
- *      Packed value.
- * \return
- *      Number of bytes written to `out`.
- */
-static size_t uint64_pack(uint64_t value, uint8_t *out) {
-  uint32_t hi = (uint32_t)(value >> 32);
-  uint32_t lo = (uint32_t)value;
-  unsigned rv;
-
-  if (hi == 0) return uint32_pack((uint32_t)lo, out);
-  out[0] = (lo) | 0x80;
-  out[1] = (lo >> 7) | 0x80;
-  out[2] = (lo >> 14) | 0x80;
-  out[3] = (lo >> 21) | 0x80;
-  if (hi < 8) {
-    out[4] = (hi << 4) | (lo >> 28);
-    return 5;
-  } else {
-    out[4] = ((hi & 7) << 4) | (lo >> 28) | 0x80;
-    hi >>= 3;
-  }
-  rv = 5;
-  while (hi >= 128) {
-    out[rv++] = hi | 0x80;
-    hi >>= 7;
-  }
-  out[rv++] = hi;
-  return rv;
-}
-
-/**
- * Pack a ProtobufCBinaryData and return the number of bytes written. The output
- * includes a length delimiter.
- *
- * \param bd
- *      ProtobufCBinaryData to encode.
- * \param[out] out
- *      Packed value.
- * \return
- *      Number of bytes written to `out`.
- */
-static inline size_t binary_data_pack(const ProtobufCBinaryData *bd,
-                                      uint8_t *out) {
-  size_t len = bd->len;
-  size_t rv = uint32_pack(len, out);
-  memcpy(out + rv, bd->data, len);
-  return rv + len;
-}
-
-/**
- * Pack a field tag.
- *
- * Wire-type will be added in required_field_pack().
- *
- * \todo Just call uint64_pack on 64-bit platforms.
- *
- * \param id
- *      Tag value to encode.
- * \param[out] out
- *      Packed value.
- * \return
- *      Number of bytes written to `out`.
- */
-static size_t tag_pack(uint32_t id, uint8_t *out) {
-  if (id < (1UL << (32 - 3)))
-    return uint32_pack(id << 3, out);
-  else
-    return uint64_pack(((uint64_t)id) << 3, out);
-}
-
-/**
- * Given a field type, return the in-memory size.
- *
- * \todo Implement as a table lookup.
- *
- * \param type
- *      Field type.
- * \return
- *      Size of the field.
- */
-static inline size_t sizeof_elt_in_repeated_array(ProtobufCType type) {
-  switch (type) {
-    case PROTOBUF_C_TYPE_SINT32:
-    case PROTOBUF_C_TYPE_INT32:
-    case PROTOBUF_C_TYPE_UINT32:
-    case PROTOBUF_C_TYPE_SFIXED32:
-    case PROTOBUF_C_TYPE_FIXED32:
-    case PROTOBUF_C_TYPE_FLOAT:
-    case PROTOBUF_C_TYPE_ENUM:
-      return 4;
-    case PROTOBUF_C_TYPE_SINT64:
-    case PROTOBUF_C_TYPE_INT64:
-    case PROTOBUF_C_TYPE_UINT64:
-    case PROTOBUF_C_TYPE_SFIXED64:
-    case PROTOBUF_C_TYPE_FIXED64:
-    case PROTOBUF_C_TYPE_DOUBLE:
-      return 8;
-    case PROTOBUF_C_TYPE_BOOL:
-      return sizeof(protobuf_c_boolean);
-    case PROTOBUF_C_TYPE_STRING:
-    case PROTOBUF_C_TYPE_MESSAGE:
-      return sizeof(void *);
-    case PROTOBUF_C_TYPE_BYTES:
-      return sizeof(ProtobufCBinaryData);
-  }
-  PROTOBUF_C__ASSERT_NOT_REACHED();
-  return 0;
-}
-
-static inline int int_range_lookup(unsigned n_ranges,
-                                   const ProtobufCIntRange *ranges, int value) {
-  unsigned n;
-  unsigned start;
-
-  if (n_ranges == 0) return -1;
-  start = 0;
-  n = n_ranges;
-  while (n > 1) {
-    unsigned mid = start + n / 2;
-
-    if (value < ranges[mid].start_value) {
-      n = mid - start;
-    } else if (value >=
-               ranges[mid].start_value +
-                   (int)(ranges[mid + 1].orig_index - ranges[mid].orig_index)) {
-      unsigned new_start = mid + 1;
-      n = start + n - new_start;
-      start = new_start;
-    } else
-      return (value - ranges[mid].start_value) + ranges[mid].orig_index;
-  }
-  if (n > 0) {
-    unsigned start_orig_index = ranges[start].orig_index;
-    unsigned range_size = ranges[start + 1].orig_index - start_orig_index;
-
-    if (ranges[start].start_value <= value &&
-        value < (int)(ranges[start].start_value + range_size)) {
-      return (value - ranges[start].start_value) + start_orig_index;
-    }
-  }
-  return -1;
-}
-
-static size_t parse_tag_and_wiretype(size_t len, const uint8_t *data,
-                                     uint32_t *tag_out,
-                                     ProtobufCWireType *wiretype_out) {
-  unsigned max_rv = len > 5 ? 5 : len;
-  uint32_t tag = (data[0] & 0x7f) >> 3;
-  unsigned shift = 4;
-  unsigned rv;
-
-  *wiretype_out = data[0] & 7;
-  if ((data[0] & 0x80) == 0) {
-    *tag_out = tag;
-    return 1;
-  }
-  for (rv = 1; rv < max_rv; rv++) {
-    if (data[rv] & 0x80) {
-      tag |= (data[rv] & 0x7f) << shift;
-      shift += 7;
-    } else {
-      tag |= data[rv] << shift;
-      *tag_out = tag;
-      return rv + 1;
-    }
-  }
-  return 0; /* error: bad header */
-}
-
-/* sizeof(ScannedMember) must be <= (1UL< len) {
-    PROTOBUF_C_UNPACK_ERROR("data too short after length-prefix of %u", val);
-    return 0;
-  }
-  return hdr_len + val;
-}
-
-static size_t max_b128_numbers(size_t len, const uint8_t *data) {
-  size_t rv = 0;
-  while (len--)
-    if ((*data++ & 0x80) == 0) ++rv;
-  return rv;
-}
-
-/**@}*/
-
-/**
- * Merge earlier message into a latter message.
- *
- * For numeric types and strings, if the same value appears multiple
- * times, the parser accepts the last value it sees. For embedded
- * message fields, the parser merges multiple instances of the same
- * field. That is, all singular scalar fields in the latter instance
- * replace those in the former, singular embedded messages are merged,
- * and repeated fields are concatenated.
- *
- * The earlier message should be freed after calling this function, as
- * some of its fields may have been reused and changed to their default
- * values during the merge.
- */
-static protobuf_c_boolean merge_messages(ProtobufCMessage *earlier_msg,
-                                         ProtobufCMessage *latter_msg,
-                                         ProtobufCAllocator *allocator) {
-  unsigned i;
-  const ProtobufCFieldDescriptor *fields = latter_msg->descriptor->fields;
-  for (i = 0; i < latter_msg->descriptor->n_fields; i++) {
-    if (fields[i].label == PROTOBUF_C_LABEL_REPEATED) {
-      size_t *n_earlier =
-          STRUCT_MEMBER_PTR(size_t, earlier_msg, fields[i].quantifier_offset);
-      uint8_t **p_earlier =
-          STRUCT_MEMBER_PTR(uint8_t *, earlier_msg, fields[i].offset);
-      size_t *n_latter =
-          STRUCT_MEMBER_PTR(size_t, latter_msg, fields[i].quantifier_offset);
-      uint8_t **p_latter =
-          STRUCT_MEMBER_PTR(uint8_t *, latter_msg, fields[i].offset);
-
-      if (*n_earlier > 0) {
-        if (*n_latter > 0) {
-          /* Concatenate the repeated field */
-          size_t el_size = sizeof_elt_in_repeated_array(fields[i].type);
-          uint8_t *new_field;
-
-          new_field = do_alloc(allocator, (*n_earlier + *n_latter) * el_size);
-          if (!new_field) return FALSE;
-
-          memcpy(new_field, *p_earlier, *n_earlier * el_size);
-          memcpy(new_field + *n_earlier * el_size, *p_latter,
-                 *n_latter * el_size);
-
-          do_free(allocator, *p_latter);
-          do_free(allocator, *p_earlier);
-          *p_latter = new_field;
-          *n_latter = *n_earlier + *n_latter;
-        } else {
-          /* Zero copy the repeated field from the earlier message */
-          *n_latter = *n_earlier;
-          *p_latter = *p_earlier;
-        }
-        /* Make sure the field does not get double freed */
-        *n_earlier = 0;
-        *p_earlier = 0;
-      }
-    } else if (fields[i].label == PROTOBUF_C_LABEL_OPTIONAL ||
-               fields[i].label == PROTOBUF_C_LABEL_NONE) {
-      const ProtobufCFieldDescriptor *field;
-      uint32_t *earlier_case_p =
-          STRUCT_MEMBER_PTR(uint32_t, earlier_msg, fields[i].quantifier_offset);
-      uint32_t *latter_case_p =
-          STRUCT_MEMBER_PTR(uint32_t, latter_msg, fields[i].quantifier_offset);
-      protobuf_c_boolean need_to_merge = FALSE;
-      void *earlier_elem;
-      void *latter_elem;
-      const void *def_val;
-
-      if (fields[i].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) {
-        if (*latter_case_p == 0) {
-          /* lookup correct oneof field */
-          int field_index = int_range_lookup(
-              latter_msg->descriptor->n_field_ranges,
-              latter_msg->descriptor->field_ranges, *earlier_case_p);
-          field = latter_msg->descriptor->fields + field_index;
-        } else {
-          /* Oneof is present in the latter message, move on */
-          continue;
-        }
-      } else {
-        field = &fields[i];
-      }
-
-      earlier_elem = STRUCT_MEMBER_P(earlier_msg, field->offset);
-      latter_elem = STRUCT_MEMBER_P(latter_msg, field->offset);
-      def_val = field->default_value;
-
-      switch (field->type) {
-        case PROTOBUF_C_TYPE_MESSAGE: {
-          ProtobufCMessage *em = *(ProtobufCMessage **)earlier_elem;
-          ProtobufCMessage *lm = *(ProtobufCMessage **)latter_elem;
-          if (em != NULL) {
-            if (lm != NULL) {
-              if (!merge_messages(em, lm, allocator)) return FALSE;
-              /* Already merged */
-              need_to_merge = FALSE;
-            } else {
-              /* Zero copy the message */
-              need_to_merge = TRUE;
-            }
-          }
-          break;
-        }
-        case PROTOBUF_C_TYPE_BYTES: {
-          uint8_t *e_data = ((ProtobufCBinaryData *)earlier_elem)->data;
-          uint8_t *l_data = ((ProtobufCBinaryData *)latter_elem)->data;
-          const ProtobufCBinaryData *d_bd = (ProtobufCBinaryData *)def_val;
-
-          need_to_merge =
-              (e_data != NULL && (d_bd == NULL || e_data != d_bd->data)) &&
-              (l_data == NULL || (d_bd != NULL && l_data == d_bd->data));
-          break;
-        }
-        case PROTOBUF_C_TYPE_STRING: {
-          char *e_str = *(char **)earlier_elem;
-          char *l_str = *(char **)latter_elem;
-          const char *d_str = def_val;
-
-          need_to_merge = e_str != d_str && l_str == d_str;
-          break;
-        }
-        default: {
-          /* Could be has field or case enum, the logic is
-           * equivalent, since 0 (FALSE) means not set for
-           * oneof */
-          need_to_merge = (*earlier_case_p != 0) && (*latter_case_p == 0);
-          break;
-        }
-      }
-
-      if (need_to_merge) {
-        size_t el_size = sizeof_elt_in_repeated_array(field->type);
-        memcpy(latter_elem, earlier_elem, el_size);
-        /*
-         * Reset the element from the old message to 0
-         * to make sure earlier message deallocation
-         * doesn't corrupt zero-copied data in the new
-         * message, earlier message will be freed after
-         * this function is called anyway
-         */
-        memset(earlier_elem, 0, el_size);
-
-        if (field->quantifier_offset != 0) {
-          /* Set the has field or the case enum,
-           * if applicable */
-          *latter_case_p = *earlier_case_p;
-          *earlier_case_p = 0;
-        }
-      }
-    }
-  }
-  return TRUE;
-}
-
-/**
- * Count packed elements.
- *
- * Given a raw slab of packed-repeated values, determine the number of
- * elements. This function detects certain kinds of errors but not
- * others; the remaining error checking is done by
- * parse_packed_repeated_member().
- */
-static protobuf_c_boolean count_packed_elements(ProtobufCType type, size_t len,
-                                                const uint8_t *data,
-                                                size_t *count_out) {
-  switch (type) {
-    case PROTOBUF_C_TYPE_SFIXED32:
-    case PROTOBUF_C_TYPE_FIXED32:
-    case PROTOBUF_C_TYPE_FLOAT:
-      if (len % 4 != 0) {
-        PROTOBUF_C_UNPACK_ERROR(
-            "length must be a multiple of 4 for fixed-length 32-bit types");
-        return FALSE;
-      }
-      *count_out = len / 4;
-      return TRUE;
-    case PROTOBUF_C_TYPE_SFIXED64:
-    case PROTOBUF_C_TYPE_FIXED64:
-    case PROTOBUF_C_TYPE_DOUBLE:
-      if (len % 8 != 0) {
-        PROTOBUF_C_UNPACK_ERROR(
-            "length must be a multiple of 8 for fixed-length 64-bit types");
-        return FALSE;
-      }
-      *count_out = len / 8;
-      return TRUE;
-    case PROTOBUF_C_TYPE_ENUM:
-    case PROTOBUF_C_TYPE_INT32:
-    case PROTOBUF_C_TYPE_SINT32:
-    case PROTOBUF_C_TYPE_UINT32:
-    case PROTOBUF_C_TYPE_INT64:
-    case PROTOBUF_C_TYPE_SINT64:
-    case PROTOBUF_C_TYPE_UINT64:
-      *count_out = max_b128_numbers(len, data);
-      return TRUE;
-    case PROTOBUF_C_TYPE_BOOL:
-      *count_out = len;
-      return TRUE;
-    case PROTOBUF_C_TYPE_STRING:
-    case PROTOBUF_C_TYPE_BYTES:
-    case PROTOBUF_C_TYPE_MESSAGE:
-    default:
-      PROTOBUF_C_UNPACK_ERROR("bad protobuf-c type %u for packed-repeated",
-                              type);
-      return FALSE;
-  }
-}
-
-static inline uint32_t parse_uint32(unsigned len, const uint8_t *data) {
-  uint32_t rv = data[0] & 0x7f;
-  if (len > 1) {
-    rv |= ((uint32_t)(data[1] & 0x7f) << 7);
-    if (len > 2) {
-      rv |= ((uint32_t)(data[2] & 0x7f) << 14);
-      if (len > 3) {
-        rv |= ((uint32_t)(data[3] & 0x7f) << 21);
-        if (len > 4) rv |= ((uint32_t)(data[4]) << 28);
-      }
-    }
-  }
-  return rv;
-}
-
-static inline uint32_t parse_int32(unsigned len, const uint8_t *data) {
-  return parse_uint32(len, data);
-}
-
-static inline int32_t unzigzag32(uint32_t v) {
-  if (v & 1)
-    return -(v >> 1) - 1;
-  else
-    return v >> 1;
-}
-
-static inline uint32_t parse_fixed_uint32(const uint8_t *data) {
-#if !defined(WORDS_BIGENDIAN)
-  uint32_t t;
-  memcpy(&t, data, 4);
-  return t;
-#else
-  return data[0] | ((uint32_t)(data[1]) << 8) | ((uint32_t)(data[2]) << 16) |
-         ((uint32_t)(data[3]) << 24);
-#endif
-}
-
-static uint64_t parse_uint64(unsigned len, const uint8_t *data) {
-  unsigned shift, i;
-  uint64_t rv;
-
-  if (len < 5) return parse_uint32(len, data);
-  rv = ((uint64_t)(data[0] & 0x7f)) | ((uint64_t)(data[1] & 0x7f) << 7) |
-       ((uint64_t)(data[2] & 0x7f) << 14) | ((uint64_t)(data[3] & 0x7f) << 21);
-  shift = 28;
-  for (i = 4; i < len; i++) {
-    rv |= (((uint64_t)(data[i] & 0x7f)) << shift);
-    shift += 7;
-  }
-  return rv;
-}
-
-static inline int64_t unzigzag64(uint64_t v) {
-  if (v & 1)
-    return -(v >> 1) - 1;
-  else
-    return v >> 1;
-}
-
-static inline uint64_t parse_fixed_uint64(const uint8_t *data) {
-#if !defined(WORDS_BIGENDIAN)
-  uint64_t t;
-  memcpy(&t, data, 8);
-  return t;
-#else
-  return (uint64_t)parse_fixed_uint32(data) |
-         (((uint64_t)parse_fixed_uint32(data + 4)) << 32);
-#endif
-}
-
-static protobuf_c_boolean parse_boolean(unsigned len, const uint8_t *data) {
-  unsigned i;
-  for (i = 0; i < len; i++)
-    if (data[i] & 0x7f) return TRUE;
-  return FALSE;
-}
-
-static protobuf_c_boolean parse_required_member(
-    ScannedMember *scanned_member, void *member, ProtobufCAllocator *allocator,
-    protobuf_c_boolean maybe_clear) {
-  unsigned len = scanned_member->len;
-  const uint8_t *data = scanned_member->data;
-  ProtobufCWireType wire_type = scanned_member->wire_type;
-
-  switch (scanned_member->field->type) {
-    case PROTOBUF_C_TYPE_ENUM:
-    case PROTOBUF_C_TYPE_INT32:
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE;
-      *(int32_t *)member = parse_int32(len, data);
-      return TRUE;
-    case PROTOBUF_C_TYPE_UINT32:
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE;
-      *(uint32_t *)member = parse_uint32(len, data);
-      return TRUE;
-    case PROTOBUF_C_TYPE_SINT32:
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE;
-      *(int32_t *)member = unzigzag32(parse_uint32(len, data));
-      return TRUE;
-    case PROTOBUF_C_TYPE_SFIXED32:
-    case PROTOBUF_C_TYPE_FIXED32:
-    case PROTOBUF_C_TYPE_FLOAT:
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_32BIT) return FALSE;
-      *(uint32_t *)member = parse_fixed_uint32(data);
-      return TRUE;
-    case PROTOBUF_C_TYPE_INT64:
-    case PROTOBUF_C_TYPE_UINT64:
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE;
-      *(uint64_t *)member = parse_uint64(len, data);
-      return TRUE;
-    case PROTOBUF_C_TYPE_SINT64:
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE;
-      *(int64_t *)member = unzigzag64(parse_uint64(len, data));
-      return TRUE;
-    case PROTOBUF_C_TYPE_SFIXED64:
-    case PROTOBUF_C_TYPE_FIXED64:
-    case PROTOBUF_C_TYPE_DOUBLE:
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_64BIT) return FALSE;
-      *(uint64_t *)member = parse_fixed_uint64(data);
-      return TRUE;
-    case PROTOBUF_C_TYPE_BOOL:
-      *(protobuf_c_boolean *)member = parse_boolean(len, data);
-      return TRUE;
-    case PROTOBUF_C_TYPE_STRING: {
-      char **pstr = member;
-      unsigned pref_len = scanned_member->length_prefix_len;
-
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE;
-
-      if (maybe_clear && *pstr != NULL) {
-        const char *def = scanned_member->field->default_value;
-        if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr);
-      }
-      *pstr = do_alloc(allocator, len - pref_len + 1);
-      if (*pstr == NULL) return FALSE;
-      memcpy(*pstr, data + pref_len, len - pref_len);
-      (*pstr)[len - pref_len] = 0;
-      return TRUE;
-    }
-    case PROTOBUF_C_TYPE_BYTES: {
-      ProtobufCBinaryData *bd = member;
-      const ProtobufCBinaryData *def_bd;
-      unsigned pref_len = scanned_member->length_prefix_len;
-
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE;
-
-      def_bd = scanned_member->field->default_value;
-      if (maybe_clear && bd->data != NULL &&
-          (def_bd == NULL || bd->data != def_bd->data)) {
-        do_free(allocator, bd->data);
-      }
-      if (len - pref_len > 0) {
-        bd->data = do_alloc(allocator, len - pref_len);
-        if (bd->data == NULL) return FALSE;
-        memcpy(bd->data, data + pref_len, len - pref_len);
-      } else {
-        bd->data = NULL;
-      }
-      bd->len = len - pref_len;
-      return TRUE;
-    }
-    case PROTOBUF_C_TYPE_MESSAGE: {
-      ProtobufCMessage **pmessage = member;
-      ProtobufCMessage *subm;
-      const ProtobufCMessage *def_mess;
-      protobuf_c_boolean merge_successful = TRUE;
-      unsigned pref_len = scanned_member->length_prefix_len;
-
-      if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE;
-
-      def_mess = scanned_member->field->default_value;
-      subm =
-          protobuf_c_message_unpack(scanned_member->field->descriptor,
-                                    allocator, len - pref_len, data + pref_len);
-
-      if (maybe_clear && *pmessage != NULL && *pmessage != def_mess) {
-        if (subm != NULL)
-          merge_successful = merge_messages(*pmessage, subm, allocator);
-        /* Delete the previous message */
-        protobuf_c_message_free_unpacked(*pmessage, allocator);
-      }
-      *pmessage = subm;
-      if (subm == NULL || !merge_successful) return FALSE;
-      return TRUE;
-    }
-  }
-  return FALSE;
-}
-
-static protobuf_c_boolean parse_oneof_member(ScannedMember *scanned_member,
-                                             void *member,
-                                             ProtobufCMessage *message,
-                                             ProtobufCAllocator *allocator) {
-  uint32_t *oneof_case = STRUCT_MEMBER_PTR(
-      uint32_t, message, scanned_member->field->quantifier_offset);
-
-  /* If we have already parsed a member of this oneof, free it. */
-  if (*oneof_case != 0) {
-    /* lookup field */
-    int field_index =
-        int_range_lookup(message->descriptor->n_field_ranges,
-                         message->descriptor->field_ranges, *oneof_case);
-    const ProtobufCFieldDescriptor *old_field =
-        message->descriptor->fields + field_index;
-    size_t el_size = sizeof_elt_in_repeated_array(old_field->type);
-
-    switch (old_field->type) {
-      case PROTOBUF_C_TYPE_STRING: {
-        char **pstr = member;
-        const char *def = old_field->default_value;
-        if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr);
-        break;
-      }
-      case PROTOBUF_C_TYPE_BYTES: {
-        ProtobufCBinaryData *bd = member;
-        const ProtobufCBinaryData *def_bd = old_field->default_value;
-        if (bd->data != NULL && (def_bd == NULL || bd->data != def_bd->data)) {
-          do_free(allocator, bd->data);
-        }
-        break;
-      }
-      case PROTOBUF_C_TYPE_MESSAGE: {
-        ProtobufCMessage **pmessage = member;
-        const ProtobufCMessage *def_mess = old_field->default_value;
-        if (*pmessage != NULL && *pmessage != def_mess)
-          protobuf_c_message_free_unpacked(*pmessage, allocator);
-        break;
-      }
-      default:
-        break;
-    }
-
-    memset(member, 0, el_size);
-  }
-  if (!parse_required_member(scanned_member, member, allocator, TRUE))
-    return FALSE;
-
-  *oneof_case = scanned_member->tag;
-  return TRUE;
-}
-
-static protobuf_c_boolean parse_optional_member(ScannedMember *scanned_member,
-                                                void *member,
-                                                ProtobufCMessage *message,
-                                                ProtobufCAllocator *allocator) {
-  if (!parse_required_member(scanned_member, member, allocator, TRUE))
-    return FALSE;
-  if (scanned_member->field->quantifier_offset != 0)
-    STRUCT_MEMBER(protobuf_c_boolean, message,
-                  scanned_member->field->quantifier_offset) = TRUE;
-  return TRUE;
-}
-
-static protobuf_c_boolean parse_repeated_member(ScannedMember *scanned_member,
-                                                void *member,
-                                                ProtobufCMessage *message,
-                                                ProtobufCAllocator *allocator) {
-  const ProtobufCFieldDescriptor *field = scanned_member->field;
-  size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset);
-  size_t siz = sizeof_elt_in_repeated_array(field->type);
-  char *array = *(char **)member;
-
-  if (!parse_required_member(scanned_member, array + siz * (*p_n), allocator,
-                             FALSE)) {
-    return FALSE;
-  }
-  *p_n += 1;
-  return TRUE;
-}
-
-static unsigned scan_varint(unsigned len, const uint8_t *data) {
-  unsigned i;
-  if (len > 10) len = 10;
-  for (i = 0; i < len; i++)
-    if ((data[i] & 0x80) == 0) break;
-  if (i == len) return 0;
-  return i + 1;
-}
-
-static protobuf_c_boolean parse_packed_repeated_member(
-    ScannedMember *scanned_member, void *member, ProtobufCMessage *message) {
-  const ProtobufCFieldDescriptor *field = scanned_member->field;
-  size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset);
-  size_t siz = sizeof_elt_in_repeated_array(field->type);
-  void *array = *(char **)member + siz * (*p_n);
-  const uint8_t *at = scanned_member->data + scanned_member->length_prefix_len;
-  size_t rem = scanned_member->len - scanned_member->length_prefix_len;
-  size_t count = 0;
-  unsigned i;
-
-  switch (field->type) {
-    case PROTOBUF_C_TYPE_SFIXED32:
-    case PROTOBUF_C_TYPE_FIXED32:
-    case PROTOBUF_C_TYPE_FLOAT:
-      count = (scanned_member->len - scanned_member->length_prefix_len) / 4;
-#if !defined(WORDS_BIGENDIAN)
-      goto no_unpacking_needed;
-#else
-      for (i = 0; i < count; i++) {
-        ((uint32_t *)array)[i] = parse_fixed_uint32(at);
-        at += 4;
-      }
-      break;
-#endif
-    case PROTOBUF_C_TYPE_SFIXED64:
-    case PROTOBUF_C_TYPE_FIXED64:
-    case PROTOBUF_C_TYPE_DOUBLE:
-      count = (scanned_member->len - scanned_member->length_prefix_len) / 8;
-#if !defined(WORDS_BIGENDIAN)
-      goto no_unpacking_needed;
-#else
-      for (i = 0; i < count; i++) {
-        ((uint64_t *)array)[i] = parse_fixed_uint64(at);
-        at += 8;
-      }
-      break;
-#endif
-    case PROTOBUF_C_TYPE_ENUM:
-    case PROTOBUF_C_TYPE_INT32:
-      while (rem > 0) {
-        unsigned s = scan_varint(rem, at);
-        if (s == 0) {
-          PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int32 value");
-          return FALSE;
-        }
-        ((int32_t *)array)[count++] = parse_int32(s, at);
-        at += s;
-        rem -= s;
-      }
-      break;
-    case PROTOBUF_C_TYPE_SINT32:
-      while (rem > 0) {
-        unsigned s = scan_varint(rem, at);
-        if (s == 0) {
-          PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint32 value");
-          return FALSE;
-        }
-        ((int32_t *)array)[count++] = unzigzag32(parse_uint32(s, at));
-        at += s;
-        rem -= s;
-      }
-      break;
-    case PROTOBUF_C_TYPE_UINT32:
-      while (rem > 0) {
-        unsigned s = scan_varint(rem, at);
-        if (s == 0) {
-          PROTOBUF_C_UNPACK_ERROR("bad packed-repeated enum or uint32 value");
-          return FALSE;
-        }
-        ((uint32_t *)array)[count++] = parse_uint32(s, at);
-        at += s;
-        rem -= s;
-      }
-      break;
-
-    case PROTOBUF_C_TYPE_SINT64:
-      while (rem > 0) {
-        unsigned s = scan_varint(rem, at);
-        if (s == 0) {
-          PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint64 value");
-          return FALSE;
-        }
-        ((int64_t *)array)[count++] = unzigzag64(parse_uint64(s, at));
-        at += s;
-        rem -= s;
-      }
-      break;
-    case PROTOBUF_C_TYPE_INT64:
-    case PROTOBUF_C_TYPE_UINT64:
-      while (rem > 0) {
-        unsigned s = scan_varint(rem, at);
-        if (s == 0) {
-          PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int64/uint64 value");
-          return FALSE;
-        }
-        ((int64_t *)array)[count++] = parse_uint64(s, at);
-        at += s;
-        rem -= s;
-      }
-      break;
-    case PROTOBUF_C_TYPE_BOOL:
-      count = rem;
-      for (i = 0; i < count; i++) {
-        if (at[i] > 1) {
-          PROTOBUF_C_UNPACK_ERROR("bad packed-repeated boolean value");
-          return FALSE;
-        }
-        ((protobuf_c_boolean *)array)[i] = at[i];
-      }
-      break;
-    default:
-      PROTOBUF_C__ASSERT_NOT_REACHED();
-  }
-  *p_n += count;
-  return TRUE;
-
-#if !defined(WORDS_BIGENDIAN)
-no_unpacking_needed:
-  memcpy(array, at, count * siz);
-  *p_n += count;
-  return TRUE;
-#endif
-}
-
-static protobuf_c_boolean is_packable_type(ProtobufCType type) {
-  return type != PROTOBUF_C_TYPE_STRING && type != PROTOBUF_C_TYPE_BYTES &&
-         type != PROTOBUF_C_TYPE_MESSAGE;
-}
-
-static protobuf_c_boolean parse_member(ScannedMember *scanned_member,
-                                       ProtobufCMessage *message,
-                                       ProtobufCAllocator *allocator) {
-  const ProtobufCFieldDescriptor *field = scanned_member->field;
-  void *member;
-
-  if (field == NULL) {
-    ProtobufCMessageUnknownField *ufield =
-        message->unknown_fields + (message->n_unknown_fields++);
-    ufield->tag = scanned_member->tag;
-    ufield->wire_type = scanned_member->wire_type;
-    ufield->len = scanned_member->len;
-    ufield->data = do_alloc(allocator, scanned_member->len);
-    if (ufield->data == NULL) return FALSE;
-    memcpy(ufield->data, scanned_member->data, ufield->len);
-    return TRUE;
-  }
-  member = (char *)message + field->offset;
-  switch (field->label) {
-    case PROTOBUF_C_LABEL_REQUIRED:
-      return parse_required_member(scanned_member, member, allocator, TRUE);
-    case PROTOBUF_C_LABEL_OPTIONAL:
-    case PROTOBUF_C_LABEL_NONE:
-      if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF)) {
-        return parse_oneof_member(scanned_member, member, message, allocator);
-      } else {
-        return parse_optional_member(scanned_member, member, message,
-                                     allocator);
-      }
-    case PROTOBUF_C_LABEL_REPEATED:
-      if (scanned_member->wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED &&
-          (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) ||
-           is_packable_type(field->type))) {
-        return parse_packed_repeated_member(scanned_member, member, message);
-      } else {
-        return parse_repeated_member(scanned_member, member, message,
-                                     allocator);
-      }
-  }
-  PROTOBUF_C__ASSERT_NOT_REACHED();
-  return 0;
-}
-
-/**
- * Initialise messages generated by old code.
- *
- * This function is used if desc->message_init == NULL (which occurs
- * for old code, and which would be useful to support allocating
- * descriptors dynamically).
- */
-static void message_init_generic(const ProtobufCMessageDescriptor *desc,
-                                 ProtobufCMessage *message) {
-  unsigned i;
-
-  memset(message, 0, desc->sizeof_message);
-  message->descriptor = desc;
-  for (i = 0; i < desc->n_fields; i++) {
-    if (desc->fields[i].default_value != NULL &&
-        desc->fields[i].label != PROTOBUF_C_LABEL_REPEATED) {
-      void *field = STRUCT_MEMBER_P(message, desc->fields[i].offset);
-      const void *dv = desc->fields[i].default_value;
-
-      switch (desc->fields[i].type) {
-        case PROTOBUF_C_TYPE_INT32:
-        case PROTOBUF_C_TYPE_SINT32:
-        case PROTOBUF_C_TYPE_SFIXED32:
-        case PROTOBUF_C_TYPE_UINT32:
-        case PROTOBUF_C_TYPE_FIXED32:
-        case PROTOBUF_C_TYPE_FLOAT:
-        case PROTOBUF_C_TYPE_ENUM:
-          memcpy(field, dv, 4);
-          break;
-        case PROTOBUF_C_TYPE_INT64:
-        case PROTOBUF_C_TYPE_SINT64:
-        case PROTOBUF_C_TYPE_SFIXED64:
-        case PROTOBUF_C_TYPE_UINT64:
-        case PROTOBUF_C_TYPE_FIXED64:
-        case PROTOBUF_C_TYPE_DOUBLE:
-          memcpy(field, dv, 8);
-          break;
-        case PROTOBUF_C_TYPE_BOOL:
-          memcpy(field, dv, sizeof(protobuf_c_boolean));
-          break;
-        case PROTOBUF_C_TYPE_BYTES:
-          memcpy(field, dv, sizeof(ProtobufCBinaryData));
-          break;
-
-        case PROTOBUF_C_TYPE_STRING:
-        case PROTOBUF_C_TYPE_MESSAGE:
-          /*
-           * The next line essentially implements a cast
-           * from const, which is totally unavoidable.
-           */
-          *(const void **)field = dv;
-          break;
-      }
-    }
-  }
-}
-
-/**@}*/
-
-/*
- * ScannedMember slabs (an unpacking implementation detail). Before doing real
- * unpacking, we first scan through the elements to see how many there are (for
- * repeated fields), and which field to use (for non-repeated fields given
- * twice).
- *
- * In order to avoid allocations for small messages, we keep a stack-allocated
- * slab of ScannedMembers of size FIRST_SCANNED_MEMBER_SLAB_SIZE (16). After we
- * fill that up, we allocate each slab twice as large as the previous one.
- */
-#define FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2 4
-
-/*
- * The number of slabs, including the stack-allocated ones; choose the number so
- * that we would overflow if we needed a slab larger than provided.
- */
-#define MAX_SCANNED_MEMBER_SLAB                                      \
-  (sizeof(unsigned int) * 8 - 1 - BOUND_SIZEOF_SCANNED_MEMBER_LOG2 - \
-   FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2)
-
-#define REQUIRED_FIELD_BITMAP_SET(index) \
-  (required_fields_bitmap[(index) / 8] |= (1UL << ((index) % 8)))
-
-#define REQUIRED_FIELD_BITMAP_IS_SET(index) \
-  (required_fields_bitmap[(index) / 8] & (1UL << ((index) % 8)))
-
-ProtobufCMessage *protobuf_c_message_unpack(
-    const ProtobufCMessageDescriptor *desc, ProtobufCAllocator *allocator,
-    size_t len, const uint8_t *data) {
-  ProtobufCMessage *rv;
-  size_t rem = len;
-  const uint8_t *at = data;
-  const ProtobufCFieldDescriptor *last_field = desc->fields + 0;
-  ScannedMember first_member_slab[1UL << FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2];
-
-  /*
-   * scanned_member_slabs[i] is an array of arrays of ScannedMember.
-   * The first slab (scanned_member_slabs[0] is just a pointer to
-   * first_member_slab), above. All subsequent slabs will be allocated
-   * using the allocator.
-   */
-  ScannedMember *scanned_member_slabs[MAX_SCANNED_MEMBER_SLAB + 1];
-  unsigned which_slab = 0;    /* the slab we are currently populating */
-  unsigned in_slab_index = 0; /* number of members in the slab */
-  size_t n_unknown = 0;
-  unsigned f;
-  unsigned j;
-  unsigned i_slab;
-  unsigned last_field_index = 0;
-  unsigned required_fields_bitmap_len;
-  unsigned char required_fields_bitmap_stack[16];
-  unsigned char *required_fields_bitmap = required_fields_bitmap_stack;
-  protobuf_c_boolean required_fields_bitmap_alloced = FALSE;
-
-  ASSERT_IS_MESSAGE_DESCRIPTOR(desc);
-
-  if (allocator == NULL) allocator = &protobuf_c__allocator;
-
-  rv = do_alloc(allocator, desc->sizeof_message);
-  if (!rv) return (NULL);
-  scanned_member_slabs[0] = first_member_slab;
-
-  required_fields_bitmap_len = (desc->n_fields + 7) / 8;
-  if (required_fields_bitmap_len > sizeof(required_fields_bitmap_stack)) {
-    required_fields_bitmap = do_alloc(allocator, required_fields_bitmap_len);
-    if (!required_fields_bitmap) {
-      do_free(allocator, rv);
-      return (NULL);
-    }
-    required_fields_bitmap_alloced = TRUE;
-  }
-  memset(required_fields_bitmap, 0, required_fields_bitmap_len);
-
-  /*
-   * Generated code always defines "message_init". However, we provide a
-   * fallback for (1) users of old protobuf-c generated-code that do not
-   * provide the function, and (2) descriptors constructed from some other
-   * source (most likely, direct construction from the .proto file).
-   */
-  if (desc->message_init != NULL)
-    protobuf_c_message_init(desc, rv);
-  else
-    message_init_generic(desc, rv);
-
-  while (rem > 0) {
-    uint32_t tag;
-    ProtobufCWireType wire_type;
-    size_t used = parse_tag_and_wiretype(rem, at, &tag, &wire_type);
-    const ProtobufCFieldDescriptor *field;
-    ScannedMember tmp;
-
-    if (used == 0) {
-      PROTOBUF_C_UNPACK_ERROR("error parsing tag/wiretype at offset %u",
-                              (unsigned)(at - data));
-      goto error_cleanup_during_scan;
-    }
-    /*
-     * \todo Consider optimizing for field[1].id == tag, if field[1]
-     * exists!
-     */
-    if (last_field == NULL || last_field->id != tag) {
-      /* lookup field */
-      int field_index =
-          int_range_lookup(desc->n_field_ranges, desc->field_ranges, tag);
-      if (field_index < 0) {
-        field = NULL;
-        n_unknown++;
-      } else {
-        field = desc->fields + field_index;
-        last_field = field;
-        last_field_index = field_index;
-      }
-    } else {
-      field = last_field;
-    }
-
-    if (field != NULL && field->label == PROTOBUF_C_LABEL_REQUIRED)
-      REQUIRED_FIELD_BITMAP_SET(last_field_index);
-
-    at += used;
-    rem -= used;
-    tmp.tag = tag;
-    tmp.wire_type = wire_type;
-    tmp.field = field;
-    tmp.data = at;
-    tmp.length_prefix_len = 0;
-
-    switch (wire_type) {
-      case PROTOBUF_C_WIRE_TYPE_VARINT: {
-        unsigned max_len = rem < 10 ? rem : 10;
-        unsigned i;
-
-        for (i = 0; i < max_len; i++)
-          if ((at[i] & 0x80) == 0) break;
-        if (i == max_len) {
-          PROTOBUF_C_UNPACK_ERROR("unterminated varint at offset %u",
-                                  (unsigned)(at - data));
-          goto error_cleanup_during_scan;
-        }
-        tmp.len = i + 1;
-        break;
-      }
-      case PROTOBUF_C_WIRE_TYPE_64BIT:
-        if (rem < 8) {
-          PROTOBUF_C_UNPACK_ERROR("too short after 64bit wiretype at offset %u",
-                                  (unsigned)(at - data));
-          goto error_cleanup_during_scan;
-        }
-        tmp.len = 8;
-        break;
-      case PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED: {
-        size_t pref_len;
-
-        tmp.len = scan_length_prefixed_data(rem, at, &pref_len);
-        if (tmp.len == 0) {
-          /* NOTE: scan_length_prefixed_data calls UNPACK_ERROR */
-          goto error_cleanup_during_scan;
-        }
-        tmp.length_prefix_len = pref_len;
-        break;
-      }
-      case PROTOBUF_C_WIRE_TYPE_32BIT:
-        if (rem < 4) {
-          PROTOBUF_C_UNPACK_ERROR("too short after 32bit wiretype at offset %u",
-                                  (unsigned)(at - data));
-          goto error_cleanup_during_scan;
-        }
-        tmp.len = 4;
-        break;
-      default:
-        PROTOBUF_C_UNPACK_ERROR("unsupported tag %u at offset %u", wire_type,
-                                (unsigned)(at - data));
-        goto error_cleanup_during_scan;
-    }
-
-    if (in_slab_index ==
-        (1UL << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2))) {
-      size_t size;
-
-      in_slab_index = 0;
-      if (which_slab == MAX_SCANNED_MEMBER_SLAB) {
-        PROTOBUF_C_UNPACK_ERROR("too many fields");
-        goto error_cleanup_during_scan;
-      }
-      which_slab++;
-      size = sizeof(ScannedMember)
-             << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2);
-      scanned_member_slabs[which_slab] = do_alloc(allocator, size);
-      if (scanned_member_slabs[which_slab] == NULL)
-        goto error_cleanup_during_scan;
-    }
-    scanned_member_slabs[which_slab][in_slab_index++] = tmp;
-
-    if (field != NULL && field->label == PROTOBUF_C_LABEL_REPEATED) {
-      size_t *n = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset);
-      if (wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED &&
-          (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) ||
-           is_packable_type(field->type))) {
-        size_t count;
-        if (!count_packed_elements(field->type, tmp.len - tmp.length_prefix_len,
-                                   tmp.data + tmp.length_prefix_len, &count)) {
-          PROTOBUF_C_UNPACK_ERROR("counting packed elements");
-          goto error_cleanup_during_scan;
-        }
-        *n += count;
-      } else {
-        *n += 1;
-      }
-    }
-
-    at += tmp.len;
-    rem -= tmp.len;
-  }
-
-  /* allocate space for repeated fields, also check that all required fields
-   * have been set */
-  for (f = 0; f < desc->n_fields; f++) {
-    const ProtobufCFieldDescriptor *field = desc->fields + f;
-    if (field->label == PROTOBUF_C_LABEL_REPEATED) {
-      size_t siz = sizeof_elt_in_repeated_array(field->type);
-      size_t *n_ptr = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset);
-      if (*n_ptr != 0) {
-        unsigned n = *n_ptr;
-        void *a;
-        *n_ptr = 0;
-        assert(rv->descriptor != NULL);
-#define CLEAR_REMAINING_N_PTRS()                               \
-  for (f++; f < desc->n_fields; f++) {                         \
-    field = desc->fields + f;                                  \
-    if (field->label == PROTOBUF_C_LABEL_REPEATED)             \
-      STRUCT_MEMBER(size_t, rv, field->quantifier_offset) = 0; \
-  }
-        a = do_alloc(allocator, siz * n);
-        if (!a) {
-          CLEAR_REMAINING_N_PTRS();
-          goto error_cleanup;
-        }
-        STRUCT_MEMBER(void *, rv, field->offset) = a;
-      }
-    } else if (field->label == PROTOBUF_C_LABEL_REQUIRED) {
-      if (field->default_value == NULL && !REQUIRED_FIELD_BITMAP_IS_SET(f)) {
-        CLEAR_REMAINING_N_PTRS();
-        PROTOBUF_C_UNPACK_ERROR("message '%s': missing required field '%s'",
-                                desc->name, field->name);
-        goto error_cleanup;
-      }
-    }
-  }
-#undef CLEAR_REMAINING_N_PTRS
-
-  /* allocate space for unknown fields */
-  if (n_unknown) {
-    rv->unknown_fields =
-        do_alloc(allocator, n_unknown * sizeof(ProtobufCMessageUnknownField));
-    if (rv->unknown_fields == NULL) goto error_cleanup;
-  }
-
-  /* do real parsing */
-  for (i_slab = 0; i_slab <= which_slab; i_slab++) {
-    unsigned max =
-        (i_slab == which_slab) ? in_slab_index : (1UL << (i_slab + 4));
-    ScannedMember *slab = scanned_member_slabs[i_slab];
-
-    for (j = 0; j < max; j++) {
-      if (!parse_member(slab + j, rv, allocator)) {
-        PROTOBUF_C_UNPACK_ERROR(
-            "error parsing member %s of %s",
-            slab->field ? slab->field->name : "*unknown-field*", desc->name);
-        goto error_cleanup;
-      }
-    }
-  }
-
-  /* cleanup */
-  for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]);
-  if (required_fields_bitmap_alloced)
-    do_free(allocator, required_fields_bitmap);
-  return rv;
-
-error_cleanup:
-  protobuf_c_message_free_unpacked(rv, allocator);
-  for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]);
-  if (required_fields_bitmap_alloced)
-    do_free(allocator, required_fields_bitmap);
-  return NULL;
-
-error_cleanup_during_scan:
-  do_free(allocator, rv);
-  for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]);
-  if (required_fields_bitmap_alloced)
-    do_free(allocator, required_fields_bitmap);
-  return NULL;
-}
-
-void protobuf_c_message_free_unpacked(ProtobufCMessage *message,
-                                      ProtobufCAllocator *allocator) {
-  const ProtobufCMessageDescriptor *desc;
-  unsigned f;
-
-  if (message == NULL) return;
-
-  desc = message->descriptor;
-
-  ASSERT_IS_MESSAGE(message);
-
-  if (allocator == NULL) allocator = &protobuf_c__allocator;
-  message->descriptor = NULL;
-  for (f = 0; f < desc->n_fields; f++) {
-    if (0 != (desc->fields[f].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) &&
-        desc->fields[f].id !=
-            STRUCT_MEMBER(uint32_t, message,
-                          desc->fields[f].quantifier_offset)) {
-      /* This is not the selected oneof, skip it */
-      continue;
-    }
-
-    if (desc->fields[f].label == PROTOBUF_C_LABEL_REPEATED) {
-      size_t n =
-          STRUCT_MEMBER(size_t, message, desc->fields[f].quantifier_offset);
-      void *arr = STRUCT_MEMBER(void *, message, desc->fields[f].offset);
-
-      if (arr != NULL) {
-        if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) {
-          unsigned i;
-          for (i = 0; i < n; i++) do_free(allocator, ((char **)arr)[i]);
-        } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) {
-          unsigned i;
-          for (i = 0; i < n; i++)
-            do_free(allocator, ((ProtobufCBinaryData *)arr)[i].data);
-        } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) {
-          unsigned i;
-          for (i = 0; i < n; i++)
-            protobuf_c_message_free_unpacked(((ProtobufCMessage **)arr)[i],
-                                             allocator);
-        }
-        do_free(allocator, arr);
-      }
-    } else if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) {
-      char *str = STRUCT_MEMBER(char *, message, desc->fields[f].offset);
-
-      if (str && str != desc->fields[f].default_value) do_free(allocator, str);
-    } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) {
-      void *data =
-          STRUCT_MEMBER(ProtobufCBinaryData, message, desc->fields[f].offset)
-              .data;
-      const ProtobufCBinaryData *default_bd;
-
-      default_bd = desc->fields[f].default_value;
-      if (data != NULL && (default_bd == NULL || default_bd->data != data)) {
-        do_free(allocator, data);
-      }
-    } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) {
-      ProtobufCMessage *sm;
-
-      sm = STRUCT_MEMBER(ProtobufCMessage *, message, desc->fields[f].offset);
-      if (sm && sm != desc->fields[f].default_value)
-        protobuf_c_message_free_unpacked(sm, allocator);
-    }
-  }
-
-  for (f = 0; f < message->n_unknown_fields; f++)
-    do_free(allocator, message->unknown_fields[f].data);
-  if (message->unknown_fields != NULL)
-    do_free(allocator, message->unknown_fields);
-
-  do_free(allocator, message);
-}
-
-void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor,
-                             void *message) {
-  descriptor->message_init((ProtobufCMessage *)(message));
-}
-
-protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) {
-  unsigned i;
-
-  if (!message || !message->descriptor ||
-      message->descriptor->magic != PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) {
-    return FALSE;
-  }
-
-  for (i = 0; i < message->descriptor->n_fields; i++) {
-    const ProtobufCFieldDescriptor *f = message->descriptor->fields + i;
-    ProtobufCType type = f->type;
-    ProtobufCLabel label = f->label;
-    void *field = STRUCT_MEMBER_P(message, f->offset);
-
-    if (label == PROTOBUF_C_LABEL_REPEATED) {
-      size_t *quantity = STRUCT_MEMBER_P(message, f->quantifier_offset);
-
-      if (*quantity > 0 && *(void **)field == NULL) {
-        return FALSE;
-      }
-
-      if (type == PROTOBUF_C_TYPE_MESSAGE) {
-        ProtobufCMessage **submessage = *(ProtobufCMessage ***)field;
-        unsigned j;
-        for (j = 0; j < *quantity; j++) {
-          if (!protobuf_c_message_check(submessage[j])) return FALSE;
-        }
-      } else if (type == PROTOBUF_C_TYPE_STRING) {
-        char **string = *(char ***)field;
-        unsigned j;
-        for (j = 0; j < *quantity; j++) {
-          if (!string[j]) return FALSE;
-        }
-      } else if (type == PROTOBUF_C_TYPE_BYTES) {
-        ProtobufCBinaryData *bd = *(ProtobufCBinaryData **)field;
-        unsigned j;
-        for (j = 0; j < *quantity; j++) {
-          if (bd[j].len > 0 && bd[j].data == NULL) return FALSE;
-        }
-      }
-
-    } else { /* PROTOBUF_C_LABEL_REQUIRED or PROTOBUF_C_LABEL_OPTIONAL */
-
-      if (type == PROTOBUF_C_TYPE_MESSAGE) {
-        ProtobufCMessage *submessage = *(ProtobufCMessage **)field;
-        if (label == PROTOBUF_C_LABEL_REQUIRED || submessage != NULL) {
-          if (!protobuf_c_message_check(submessage)) return FALSE;
-        }
-      } else if (type == PROTOBUF_C_TYPE_STRING) {
-        char *string = *(char **)field;
-        if (label == PROTOBUF_C_LABEL_REQUIRED && string == NULL) return FALSE;
-      } else if (type == PROTOBUF_C_TYPE_BYTES) {
-        protobuf_c_boolean *has =
-            STRUCT_MEMBER_P(message, f->quantifier_offset);
-        ProtobufCBinaryData *bd = field;
-        if (label == PROTOBUF_C_LABEL_REQUIRED || *has == TRUE) {
-          if (bd->len > 0 && bd->data == NULL) return FALSE;
-        }
-      }
-    }
-  }
-
-  return TRUE;
-}
-
-/* === services === */
-
-typedef void (*GenericHandler)(void *service, const ProtobufCMessage *input,
-                               ProtobufCClosure closure, void *closure_data);
diff --git a/mobile/tools/quantification/src/protobuf-c.h b/mobile/tools/quantification/src/protobuf-c.h
deleted file mode 100644
index bd85695b86..0000000000
--- a/mobile/tools/quantification/src/protobuf-c.h
+++ /dev/null
@@ -1,921 +0,0 @@
-/*
- * Copyright (c) 2008-2017, Dave Benson and the protobuf-c authors.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*! \file
- * \mainpage Introduction
- *
- * This is [protobuf-c], a C implementation of [Protocol Buffers].
- *
- * This file defines the public API for the `libprotobuf-c` support library.
- * This API includes interfaces that can be used directly by client code as well
- * as the interfaces used by the code generated by the `protoc-c` compiler.
- *
- * The `libprotobuf-c` support library performs the actual serialization and
- * deserialization of Protocol Buffers messages. It interacts with structures,
- * definitions, and metadata generated by the `protoc-c` compiler from .proto
- * files.
- *
- * \authors Dave Benson and the `protobuf-c` authors.
- *
- * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license.
- *
- * [protobuf-c]:       https://github.com/protobuf-c/protobuf-c
- * [Protocol Buffers]: https://developers.google.com/protocol-buffers/
- * [BSD-2-Clause]:     http://opensource.org/licenses/BSD-2-Clause
- *
- * \page gencode Generated Code
- *
- * For each enum, we generate a C enum. For each message, we generate a C
- * structure which can be cast to a `ProtobufCMessage`.
- *
- * For each enum and message, we generate a descriptor object that allows us to
- * implement a kind of reflection on the structures.
- *
- * First, some naming conventions:
- *
- * - The name of the type for enums and messages and services is camel case
- *   (meaning WordsAreCrammedTogether) except that double underscores are used
- *   to delimit scopes. For example, the following `.proto` file:
- *
-~~~{.proto}
-        package foo.bar;
-        message BazBah {
-            optional int32 val = 1;
-        }
-~~~
- *
- * would generate a C type `Foo__Bar__BazBah`.
- *
- * - Identifiers for functions and globals are all lowercase, with camel case
- *   words separated by single underscores. For example, one of the function
- *   prototypes generated by `protoc-c` for the above example:
- *
-~~~{.c}
-Foo__Bar__BazBah *
-       foo__bar__baz_bah__unpack
-                     (ProtobufCAllocator  *allocator,
-                      size_t               len,
-                      const uint8_t       *data);
-~~~
- *
- * - Identifiers for enum values contain an uppercase prefix which embeds the
- *   package name and the enum type name.
- *
- * - A double underscore is used to separate further components of identifier
- *   names.
- *
- * For example, in the name of the unpack function above, the package name
- * `foo.bar` has become `foo__bar`, the message name BazBah has become
- * `baz_bah`, and the method name is `unpack`. These are all joined with double
- * underscores to form the C identifier `foo__bar__baz_bah__unpack`.
- *
- * We also generate descriptor objects for messages and enums. These are
- * declared in the `.pb-c.h` files:
- *
-~~~{.c}
-extern const ProtobufCMessageDescriptor foo__bar__baz_bah__descriptor;
-~~~
- *
- * The message structures all begin with `ProtobufCMessageDescriptor *` which is
- * sufficient to allow them to be cast to `ProtobufCMessage`.
- *
- * For each message defined in a `.proto` file, we generate a number of
- * functions and macros. Each function name contains a prefix based on the
- * package name and message name in order to make it a unique C identifier.
- *
- * - `INIT`. Statically initializes a message object, initializing its
- *   descriptor and setting its fields to default values. Uninitialized
- *   messages cannot be processed by the protobuf-c library.
- *
-~~~{.c}
-#define FOO__BAR__BAZ_BAH__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&foo__bar__baz_bah__descriptor), 0 }
-~~~
- * - `init()`. Initializes a message object, initializing its descriptor and
- *   setting its fields to default values. Uninitialized messages cannot be
- *   processed by the protobuf-c library.
- *
-~~~{.c}
-void foo__bar__baz_bah__init
-                     (Foo__Bar__BazBah *message);
-~~~
- * - `unpack()`. Unpacks data for a particular message format. Note that the
- *   `allocator` parameter is usually `NULL` to indicate that the system's
- *   `malloc()` and `free()` functions should be used for dynamically allocating
- *   memory.
- *
-~~~{.c}
-Foo__Bar__BazBah *
-       foo__bar__baz_bah__unpack
-                     (ProtobufCAllocator  *allocator,
-                      size_t               len,
-                      const uint8_t       *data);
-~~~
- *
- * - `free_unpacked()`. Frees a message object obtained with the `unpack()`
- *   method. Freeing `NULL` is allowed (the same as with `free()`).
- *
-~~~{.c}
-void   foo__bar__baz_bah__free_unpacked
-                     (Foo__Bar__BazBah *message,
-                      ProtobufCAllocator *allocator);
-~~~
- *
- * - `get_packed_size()`. Calculates the length in bytes of the serialized
- *   representation of the message object.
- *
-~~~{.c}
-size_t foo__bar__baz_bah__get_packed_size
-                     (const Foo__Bar__BazBah   *message);
-~~~
- *
- * - `pack()`. Pack a message object into a preallocated buffer. Assumes that
- *   the buffer is large enough. (Use `get_packed_size()` first.)
- *
-~~~{.c}
-size_t foo__bar__baz_bah__pack
-                     (const Foo__Bar__BazBah   *message,
-                      uint8_t             *out);
-~~~
- *
- * - `pack_to_buffer()`. Packs a message into a "virtual buffer". This is an
- *   object which defines an "append bytes" callback to consume data as it is
- *   serialized.
- *
-~~~{.c}
-size_t foo__bar__baz_bah__pack_to_buffer
-                     (const Foo__Bar__BazBah   *message,
-                      ProtobufCBuffer     *buffer);
-~~~
- *
- * \page pack Packing and unpacking messages
- *
- * To pack a message, first compute the packed size of the message with
- * protobuf_c_message_get_packed_size(), then allocate a buffer of at least
- * that size, then call protobuf_c_message_pack().
- *
- * Alternatively, a message can be serialized without calculating the final size
- * first. Use the protobuf_c_message_pack_to_buffer() function and provide a
- * ProtobufCBuffer object which implements an "append" method that consumes
- * data.
- *
- * To unpack a message, call the protobuf_c_message_unpack() function. The
- * result can be cast to an object of the type that matches the descriptor for
- * the message.
- *
- * The result of unpacking a message should be freed with
- * protobuf_c_message_free_unpacked().
- */
-
-#ifndef PROTOBUF_C_H
-#define PROTOBUF_C_H
-
-#include <assert.h>
-#include <limits.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-#define PROTOBUF_C__BEGIN_DECLS extern "C" {
-#define PROTOBUF_C__END_DECLS }
-#else
-#define PROTOBUF_C__BEGIN_DECLS
-#define PROTOBUF_C__END_DECLS
-#endif
-
-PROTOBUF_C__BEGIN_DECLS
-
-#if defined(_WIN32) && defined(PROTOBUF_C_USE_SHARED_LIB)
-#ifdef PROTOBUF_C_EXPORT
-#define PROTOBUF_C__API __declspec(dllexport)
-#else
-#define PROTOBUF_C__API __declspec(dllimport)
-#endif
-#else
-#define PROTOBUF_C__API
-#endif
-
-#if !defined(PROTOBUF_C__NO_DEPRECATED) && \
-    ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
-#define PROTOBUF_C__DEPRECATED __attribute__((__deprecated__))
-#else
-#define PROTOBUF_C__DEPRECATED
-#endif
-
-#ifndef PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE
-#define PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(enum_name) \
-  , _##enum_name##_IS_INT_SIZE = INT_MAX
-#endif
-
-#define PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC 0x14159bc3
-#define PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC 0x28aaeef9
-#define PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC 0x114315af
-
-/* Empty string used for initializers */
-extern const char protobuf_c_empty_string[];
-
-/**
- * \defgroup api Public API
- *
- * This is the public API for `libprotobuf-c`. These interfaces are stable and
- * subject to Semantic Versioning guarantees.
- *
- * @{
- */
-
-/**
- * Values for the `flags` word in `ProtobufCFieldDescriptor`.
- */
-typedef enum {
-  /** Set if the field is repeated and marked with the `packed` option. */
-  PROTOBUF_C_FIELD_FLAG_PACKED = (1 << 0),
-
-  /** Set if the field is marked with the `deprecated` option. */
-  PROTOBUF_C_FIELD_FLAG_DEPRECATED = (1 << 1),
-
-  /** Set if the field is a member of a oneof (union). */
-  PROTOBUF_C_FIELD_FLAG_ONEOF = (1 << 2),
-} ProtobufCFieldFlag;
-
-/**
- * Message field rules.
- *
- * \see [Defining A Message Type] in the Protocol Buffers documentation.
- *
- * [Defining A Message Type]:
- *      https://developers.google.com/protocol-buffers/docs/proto#simple
- */
-typedef enum {
-  /** A well-formed message must have exactly one of this field. */
-  PROTOBUF_C_LABEL_REQUIRED,
-
-  /**
-   * A well-formed message can have zero or one of this field (but not
-   * more than one).
-   */
-  PROTOBUF_C_LABEL_OPTIONAL,
-
-  /**
-   * This field can be repeated any number of times (including zero) in a
-   * well-formed message. The order of the repeated values will be
-   * preserved.
-   */
-  PROTOBUF_C_LABEL_REPEATED,
-
-  /**
-   * This field has no label. This is valid only in proto3 and is
-   * equivalent to OPTIONAL but no "has" quantifier will be consulted.
-   */
-  PROTOBUF_C_LABEL_NONE,
-} ProtobufCLabel;
-
-/**
- * Field value types.
- *
- * \see [Scalar Value Types] in the Protocol Buffers documentation.
- *
- * [Scalar Value Types]:
- *      https://developers.google.com/protocol-buffers/docs/proto#scalar
- */
-typedef enum {
-  PROTOBUF_C_TYPE_INT32,    /**< int32 */
-  PROTOBUF_C_TYPE_SINT32,   /**< signed int32 */
-  PROTOBUF_C_TYPE_SFIXED32, /**< signed int32 (4 bytes) */
-  PROTOBUF_C_TYPE_INT64,    /**< int64 */
-  PROTOBUF_C_TYPE_SINT64,   /**< signed int64 */
-  PROTOBUF_C_TYPE_SFIXED64, /**< signed int64 (8 bytes) */
-  PROTOBUF_C_TYPE_UINT32,   /**< unsigned int32 */
-  PROTOBUF_C_TYPE_FIXED32,  /**< unsigned int32 (4 bytes) */
-  PROTOBUF_C_TYPE_UINT64,   /**< unsigned int64 */
-  PROTOBUF_C_TYPE_FIXED64,  /**< unsigned int64 (8 bytes) */
-  PROTOBUF_C_TYPE_FLOAT,    /**< float */
-  PROTOBUF_C_TYPE_DOUBLE,   /**< double */
-  PROTOBUF_C_TYPE_BOOL,     /**< boolean */
-  PROTOBUF_C_TYPE_ENUM,     /**< enumerated type */
-  PROTOBUF_C_TYPE_STRING,   /**< UTF-8 or ASCII string */
-  PROTOBUF_C_TYPE_BYTES,    /**< arbitrary byte sequence */
-  PROTOBUF_C_TYPE_MESSAGE,  /**< nested message */
-} ProtobufCType;
-
-/**
- * Field wire types.
- *
- * \see [Message Structure] in the Protocol Buffers documentation.
- *
- * [Message Structure]:
- *      https://developers.google.com/protocol-buffers/docs/encoding#structure
- */
-typedef enum {
-  PROTOBUF_C_WIRE_TYPE_VARINT = 0,
-  PROTOBUF_C_WIRE_TYPE_64BIT = 1,
-  PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED = 2,
-  /* "Start group" and "end group" wire types are unsupported. */
-  PROTOBUF_C_WIRE_TYPE_32BIT = 5,
-} ProtobufCWireType;
-
-struct ProtobufCAllocator;
-struct ProtobufCBinaryData;
-struct ProtobufCBuffer;
-struct ProtobufCBufferSimple;
-struct ProtobufCEnumDescriptor;
-struct ProtobufCEnumValue;
-struct ProtobufCEnumValueIndex;
-struct ProtobufCFieldDescriptor;
-struct ProtobufCIntRange;
-struct ProtobufCMessage;
-struct ProtobufCMessageDescriptor;
-struct ProtobufCMessageUnknownField;
-struct ProtobufCMethodDescriptor;
-struct ProtobufCService;
-struct ProtobufCServiceDescriptor;
-
-typedef struct ProtobufCAllocator ProtobufCAllocator;
-typedef struct ProtobufCBinaryData ProtobufCBinaryData;
-typedef struct ProtobufCBuffer ProtobufCBuffer;
-typedef struct ProtobufCBufferSimple ProtobufCBufferSimple;
-typedef struct ProtobufCEnumDescriptor ProtobufCEnumDescriptor;
-typedef struct ProtobufCEnumValue ProtobufCEnumValue;
-typedef struct ProtobufCEnumValueIndex ProtobufCEnumValueIndex;
-typedef struct ProtobufCFieldDescriptor ProtobufCFieldDescriptor;
-typedef struct ProtobufCIntRange ProtobufCIntRange;
-typedef struct ProtobufCMessage ProtobufCMessage;
-typedef struct ProtobufCMessageDescriptor ProtobufCMessageDescriptor;
-typedef struct ProtobufCMessageUnknownField ProtobufCMessageUnknownField;
-typedef struct ProtobufCMethodDescriptor ProtobufCMethodDescriptor;
-typedef struct ProtobufCService ProtobufCService;
-typedef struct ProtobufCServiceDescriptor ProtobufCServiceDescriptor;
-
-/** Boolean type. */
-typedef int protobuf_c_boolean;
-
-typedef void (*ProtobufCClosure)(const ProtobufCMessage *, void *closure_data);
-typedef void (*ProtobufCMessageInit)(ProtobufCMessage *);
-typedef void (*ProtobufCServiceDestroy)(ProtobufCService *);
-
-/**
- * Structure for defining a custom memory allocator.
- */
-struct ProtobufCAllocator {
-  /** Function to allocate memory. */
-  void *(*alloc)(void *allocator_data, size_t size);
-
-  /** Function to free memory. */
-  void (*free)(void *allocator_data, void *pointer);
-
-  /** Opaque pointer passed to `alloc` and `free` functions. */
-  void *allocator_data;
-};
-
-/**
- * Structure for the protobuf `bytes` scalar type.
- *
- * The data contained in a `ProtobufCBinaryData` is an arbitrary sequence of
- * bytes. It may contain embedded `NUL` characters and is not required to be
- * `NUL`-terminated.
- */
-struct ProtobufCBinaryData {
-  size_t len;    /**< Number of bytes in the `data` field. */
-  uint8_t *data; /**< Data bytes. */
-};
-
-/**
- * Structure for defining a virtual append-only buffer. Used by
- * protobuf_c_message_pack_to_buffer() to abstract the consumption of serialized
- * bytes.
- *
- * `ProtobufCBuffer` "subclasses" may be defined on the stack. For example, to
- * write to a `FILE` object:
- *
-~~~{.c}
-typedef struct {
-        ProtobufCBuffer base;
-        FILE *fp;
-} BufferAppendToFile;
-
-static void
-my_buffer_file_append(ProtobufCBuffer *buffer,
-                      size_t len,
-                      const uint8_t *data)
-{
-        BufferAppendToFile *file_buf = (BufferAppendToFile *) buffer;
-        fwrite(data, len, 1, file_buf->fp); // XXX: No error handling!
-}
-~~~
- *
- * To use this new type of ProtobufCBuffer, it could be called as follows:
- *
-~~~{.c}
-...
-BufferAppendToFile tmp = {0};
-tmp.base.append = my_buffer_file_append;
-tmp.fp = fp;
-protobuf_c_message_pack_to_buffer(&message, &tmp);
-...
-~~~
- */
-struct ProtobufCBuffer {
-  /** Append function. Consumes the `len` bytes stored at `data`. */
-  void (*append)(ProtobufCBuffer *buffer, size_t len, const uint8_t *data);
-};
-
-/**
- * Simple buffer "subclass" of `ProtobufCBuffer`.
- *
- * A `ProtobufCBufferSimple` object is declared on the stack and uses a
- * scratch buffer provided by the user for the initial allocation. It performs
- * exponential resizing, using dynamically allocated memory. A
- * `ProtobufCBufferSimple` object can be created and used as follows:
- *
-~~~{.c}
-uint8_t pad[128];
-ProtobufCBufferSimple simple = PROTOBUF_C_BUFFER_SIMPLE_INIT(pad);
-ProtobufCBuffer *buffer = (ProtobufCBuffer *) &simple;
-~~~
- *
- * `buffer` can now be used with `protobuf_c_message_pack_to_buffer()`. Once a
- * message has been serialized to a `ProtobufCBufferSimple` object, the
- * serialized data bytes can be accessed from the `.data` field.
- *
- * To free the memory allocated by a `ProtobufCBufferSimple` object, if any,
- * call PROTOBUF_C_BUFFER_SIMPLE_CLEAR() on the object, for example:
- *
-~~~{.c}
-PROTOBUF_C_BUFFER_SIMPLE_CLEAR(&simple);
-~~~
- *
- * \see PROTOBUF_C_BUFFER_SIMPLE_INIT
- * \see PROTOBUF_C_BUFFER_SIMPLE_CLEAR
- */
-struct ProtobufCBufferSimple {
-  /** "Base class". */
-  ProtobufCBuffer base;
-  /** Number of bytes allocated in `data`. */
-  size_t alloced;
-  /** Number of bytes currently stored in `data`. */
-  size_t len;
-  /** Data bytes. */
-  uint8_t *data;
-  /** Whether `data` must be freed. */
-  protobuf_c_boolean must_free_data;
-  /** Allocator to use. May be NULL to indicate the system allocator. */
-  ProtobufCAllocator *allocator;
-};
-
-/**
- * Describes an enumeration as a whole, with all of its values.
- */
-struct ProtobufCEnumDescriptor {
-  /** Magic value checked to ensure that the API is used correctly. */
-  uint32_t magic;
-
-  /** The qualified name (e.g., "namespace.Type"). */
-  const char *name;
-  /** The unqualified name as given in the .proto file (e.g., "Type"). */
-  const char *short_name;
-  /** Identifier used in generated C code. */
-  const char *c_name;
-  /** The dot-separated namespace. */
-  const char *package_name;
-
-  /** Number elements in `values`. */
-  unsigned n_values;
-  /** Array of distinct values, sorted by numeric value. */
-  const ProtobufCEnumValue *values;
-
-  /** Number of elements in `values_by_name`. */
-  unsigned n_value_names;
-  /** Array of named values, including aliases, sorted by name. */
-  const ProtobufCEnumValueIndex *values_by_name;
-
-  /** Number of elements in `value_ranges`. */
-  unsigned n_value_ranges;
-  /** Value ranges, for faster lookups by numeric value. */
-  const ProtobufCIntRange *value_ranges;
-
-  /** Reserved for future use. */
-  void *reserved1;
-  /** Reserved for future use. */
-  void *reserved2;
-  /** Reserved for future use. */
-  void *reserved3;
-  /** Reserved for future use. */
-  void *reserved4;
-};
-
-/**
- * Represents a single value of an enumeration.
- */
-struct ProtobufCEnumValue {
-  /** The string identifying this value in the .proto file. */
-  const char *name;
-
-  /** The string identifying this value in generated C code. */
-  const char *c_name;
-
-  /** The numeric value assigned in the .proto file. */
-  int value;
-};
-
-/**
- * Used by `ProtobufCEnumDescriptor` to look up enum values.
- */
-struct ProtobufCEnumValueIndex {
-  /** Name of the enum value. */
-  const char *name;
-  /** Index into values[] array. */
-  unsigned index;
-};
-
-/**
- * Describes a single field in a message.
- */
-struct ProtobufCFieldDescriptor {
-  /** Name of the field as given in the .proto file. */
-  const char *name;
-
-  /** Tag value of the field as given in the .proto file. */
-  uint32_t id;
-
-  /** Whether the field is `REQUIRED`, `OPTIONAL`, or `REPEATED`. */
-  ProtobufCLabel label;
-
-  /** The type of the field. */
-  ProtobufCType type;
-
-  /**
-   * The offset in bytes of the message's C structure's quantifier field
-   * (the `has_MEMBER` field for optional members or the `n_MEMBER` field
-   * for repeated members or the case enum for oneofs).
-   */
-  unsigned quantifier_offset;
-
-  /**
-   * The offset in bytes into the message's C structure for the member
-   * itself.
-   */
-  unsigned offset;
-
-  /**
-   * A type-specific descriptor.
-   *
-   * If `type` is `PROTOBUF_C_TYPE_ENUM`, then `descriptor` points to the
-   * corresponding `ProtobufCEnumDescriptor`.
-   *
-   * If `type` is `PROTOBUF_C_TYPE_MESSAGE`, then `descriptor` points to
-   * the corresponding `ProtobufCMessageDescriptor`.
-   *
-   * Otherwise this field is NULL.
-   */
-  const void *descriptor; /* for MESSAGE and ENUM types */
-
-  /** The default value for this field, if defined. May be NULL. */
-  const void *default_value;
-
-  /**
-   * A flag word. Zero or more of the bits defined in the
-   * `ProtobufCFieldFlag` enum may be set.
-   */
-  uint32_t flags;
-
-  /** Reserved for future use. */
-  unsigned reserved_flags;
-  /** Reserved for future use. */
-  void *reserved2;
-  /** Reserved for future use. */
-  void *reserved3;
-};
-
-/**
- * Helper structure for optimizing int => index lookups in the case
- * where the keys are mostly consecutive values, as they presumably are for
- * enums and fields.
- *
- * The data structures requires that the values in the original array are
- * sorted.
- */
-struct ProtobufCIntRange {
-  int start_value;
-  unsigned orig_index;
-  /*
-   * NOTE: the number of values in the range can be inferred by looking
-   * at the next element's orig_index. A dummy element is added to make
-   * this simple.
-   */
-};
-
-/**
- * An instance of a message.
- *
- * `ProtobufCMessage` is a light-weight "base class" for all messages.
- *
- * In particular, `ProtobufCMessage` doesn't have any allocation policy
- * associated with it. That's because it's common to create `ProtobufCMessage`
- * objects on the stack. In fact, that's what we recommend for sending messages.
- * If the object is allocated from the stack, you can't really have a memory
- * leak.
- *
- * This means that calls to functions like protobuf_c_message_unpack() which
- * return a `ProtobufCMessage` must be paired with a call to a free function,
- * like protobuf_c_message_free_unpacked().
- */
-struct ProtobufCMessage {
-  /** The descriptor for this message type. */
-  const ProtobufCMessageDescriptor *descriptor;
-  /** The number of elements in `unknown_fields`. */
-  unsigned n_unknown_fields;
-  /** The fields that weren't recognized by the parser. */
-  ProtobufCMessageUnknownField *unknown_fields;
-};
-
-/**
- * Describes a message.
- */
-struct ProtobufCMessageDescriptor {
-  /** Magic value checked to ensure that the API is used correctly. */
-  uint32_t magic;
-
-  /** The qualified name (e.g., "namespace.Type"). */
-  const char *name;
-  /** The unqualified name as given in the .proto file (e.g., "Type"). */
-  const char *short_name;
-  /** Identifier used in generated C code. */
-  const char *c_name;
-  /** The dot-separated namespace. */
-  const char *package_name;
-
-  /**
-   * Size in bytes of the C structure representing an instance of this
-   * type of message.
-   */
-  size_t sizeof_message;
-
-  /** Number of elements in `fields`. */
-  unsigned n_fields;
-  /** Field descriptors, sorted by tag number. */
-  const ProtobufCFieldDescriptor *fields;
-  /** Used for looking up fields by name. */
-  const unsigned *fields_sorted_by_name;
-
-  /** Number of elements in `field_ranges`. */
-  unsigned n_field_ranges;
-  /** Used for looking up fields by id. */
-  const ProtobufCIntRange *field_ranges;
-
-  /** Message initialisation function. */
-  ProtobufCMessageInit message_init;
-
-  /** Reserved for future use. */
-  void *reserved1;
-  /** Reserved for future use. */
-  void *reserved2;
-  /** Reserved for future use. */
-  void *reserved3;
-};
-
-/**
- * An unknown message field.
- */
-struct ProtobufCMessageUnknownField {
-  /** The tag number. */
-  uint32_t tag;
-  /** The wire type of the field. */
-  ProtobufCWireType wire_type;
-  /** Number of bytes in `data`. */
-  size_t len;
-  /** Field data. */
-  uint8_t *data;
-};
-
-/**
- * Method descriptor.
- */
-struct ProtobufCMethodDescriptor {
-  /** Method name. */
-  const char *name;
-  /** Input message descriptor. */
-  const ProtobufCMessageDescriptor *input;
-  /** Output message descriptor. */
-  const ProtobufCMessageDescriptor *output;
-};
-
-/**
- * Service.
- */
-struct ProtobufCService {
-  /** Service descriptor. */
-  const ProtobufCServiceDescriptor *descriptor;
-  /** Function to invoke the service. */
-  void (*invoke)(ProtobufCService *service, unsigned method_index,
-                 const ProtobufCMessage *input, ProtobufCClosure closure,
-                 void *closure_data);
-  /** Function to destroy the service. */
-  void (*destroy)(ProtobufCService *service);
-};
-
-/**
- * Service descriptor.
- */
-struct ProtobufCServiceDescriptor {
-  /** Magic value checked to ensure that the API is used correctly. */
-  uint32_t magic;
-
-  /** Service name. */
-  const char *name;
-  /** Short version of service name. */
-  const char *short_name;
-  /** C identifier for the service name. */
-  const char *c_name;
-  /** Package name. */
-  const char *package;
-  /** Number of elements in `methods`. */
-  unsigned n_methods;
-  /** Method descriptors, in the order defined in the .proto file. */
-  const ProtobufCMethodDescriptor *methods;
-  /** Sort index of methods. */
-  const unsigned *method_indices_by_name;
-};
-
-/**
- * Get the version of the protobuf-c library. Note that this is the version of
- * the library linked against, not the version of the headers compiled against.
- *
- * \return A string containing the version number of protobuf-c.
- */
-PROTOBUF_C__API
-const char *protobuf_c_version(void);
-
-/**
- * Get the version of the protobuf-c library. Note that this is the version of
- * the library linked against, not the version of the headers compiled against.
- *
- * \return A 32 bit unsigned integer containing the version number of
- *      protobuf-c, represented in base-10 as (MAJOR*1E6) + (MINOR*1E3) + PATCH.
- */
-PROTOBUF_C__API
-uint32_t protobuf_c_version_number(void);
-
-/**
- * The version of the protobuf-c headers, represented as a string using the same
- * format as protobuf_c_version().
- */
-#define PROTOBUF_C_VERSION "1.3.0"
-
-/**
- * The version of the protobuf-c headers, represented as an integer using the
- * same format as protobuf_c_version_number().
- */
-#define PROTOBUF_C_VERSION_NUMBER 1003000
-
-/**
- * The minimum protoc-c version which works with the current version of the
- * protobuf-c headers.
- */
-#define PROTOBUF_C_MIN_COMPILER_VERSION 1000000
-
-/**
- * Determine the number of bytes required to store the serialised message.
- *
- * \param message
- *      The message object to serialise.
- * \return
- *      Number of bytes.
- */
-PROTOBUF_C__API
-size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message);
-
-/**
- * Unpack a serialised message into an in-memory representation.
- *
- * \param descriptor
- *      The message descriptor.
- * \param allocator
- *      `ProtobufCAllocator` to use for memory allocation. May be NULL to
- *      specify the default allocator.
- * \param len
- *      Length in bytes of the serialised message.
- * \param data
- *      Pointer to the serialised message.
- * \return
- *      An unpacked message object.
- * \retval NULL
- *      If an error occurred during unpacking.
- */
-PROTOBUF_C__API
-ProtobufCMessage *protobuf_c_message_unpack(
-    const ProtobufCMessageDescriptor *descriptor, ProtobufCAllocator *allocator,
-    size_t len, const uint8_t *data);
-
-/**
- * Free an unpacked message object.
- *
- * This function should be used to deallocate the memory used by a call to
- * protobuf_c_message_unpack().
- *
- * \param message
- *      The message object to free. May be NULL.
- * \param allocator
- *      `ProtobufCAllocator` to use for memory deallocation. May be NULL to
- *      specify the default allocator.
- */
-PROTOBUF_C__API
-void protobuf_c_message_free_unpacked(ProtobufCMessage *message,
-                                      ProtobufCAllocator *allocator);
-
-/**
- * Check the validity of a message object.
- *
- * Makes sure all required fields (`PROTOBUF_C_LABEL_REQUIRED`) are present.
- * Recursively checks nested messages.
- *
- * \retval TRUE
- *      Message is valid.
- * \retval FALSE
- *      Message is invalid.
- */
-PROTOBUF_C__API
-protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *);
-
-/** Message initialiser. */
-#define PROTOBUF_C_MESSAGE_INIT(descriptor) \
-  { descriptor, 0, NULL }
-
-/**
- * Initialise a message object from a message descriptor.
- *
- * \param descriptor
- *      Message descriptor.
- * \param message
- *      Allocated block of memory of size `descriptor->sizeof_message`.
- */
-PROTOBUF_C__API
-void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor,
-                             void *message);
-
-/**
- * Initialise a `ProtobufCBufferSimple` object.
- */
-#define PROTOBUF_C_BUFFER_SIMPLE_INIT(array_of_bytes)             \
-  {                                                               \
-    {protobuf_c_buffer_simple_append}, sizeof(array_of_bytes), 0, \
-        (array_of_bytes), 0, NULL                                 \
-  }
-
-/**
- * Clear a `ProtobufCBufferSimple` object, freeing any allocated memory.
- */
-#define PROTOBUF_C_BUFFER_SIMPLE_CLEAR(simp_buf)                              \
-  do {                                                                        \
-    if ((simp_buf)->must_free_data) {                                         \
-      if ((simp_buf)->allocator != NULL)                                      \
-        (simp_buf)->allocator->free((simp_buf)->allocator, (simp_buf)->data); \
-      else                                                                    \
-        free((simp_buf)->data);                                               \
-    }                                                                         \
-  } while (0)
-
-/**
- * The `append` method for `ProtobufCBufferSimple`.
- *
- * \param buffer
- *      The buffer object to append to. Must actually be a
- *      `ProtobufCBufferSimple` object.
- * \param len
- *      Number of bytes in `data`.
- * \param data
- *      Data to append.
- */
-PROTOBUF_C__API
-void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len,
-                                     const unsigned char *data);
-
-/**@}*/
-
-PROTOBUF_C__END_DECLS
-
-#endif /* PROTOBUF_C_H */
diff --git a/mobile/tools/quantification/src/tensor_desc.h b/mobile/tools/quantification/src/tensor_desc.h
deleted file mode 100644
index 4eadf341db..0000000000
--- a/mobile/tools/quantification/src/tensor_desc.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include 
-
-#include "src/framework.pb-c.h"
-
-namespace paddle_mobile {
-namespace framework {
-
-enum VarType_Type {
-  VARTYPE_TYPE_BOOL = 0,
-  VARTYPE_TYPE_INT16 = 1,
-  VARTYPE_TYPE_INT32 = 2,
-  VARTYPE_TYPE_INT64 = 3,
-  VARTYPE_TYPE_FP16 = 4,
-  VARTYPE_TYPE_FP32 = 5,
-  VARTYPE_TYPE_FP64 = 6,
-  VARTYPE_TYPE_LOD_TENSOR = 7,
-  VARTYPE_TYPE_SELECTED_ROWS = 8,
-  VARTYPE_TYPE_FEED_MINIBATCH = 9,
-  VARTYPE_TYPE_FETCH_LIST = 10,
-  VARTYPE_TYPE_STEP_SCOPES = 11,
-  VARTYPE_TYPE_STEP_LOD_RANK_TABLE = 12,
-  VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY = 13,
-  VARTYPE_TYPE_STEP_PLACE_LIST = 14,
-  VARTYPE_TYPE_READER = 15,
-  VARTYPE_TYPE_CHANNEL = 16,
-  VARTYPE_TYPE_RAW = 17,
-  VARTYPE_TYPE_TUPLE = 18
-};
-
-class TensorDesc {
- public:
-  TensorDesc() = default;
-  TensorDesc(const TensorDesc &desc) {
-    this->dims_ = desc.dims_;
-    this->data_type_ = desc.data_type_;
-  }
-
-  explicit TensorDesc(
-      PaddleMobile__Framework__Proto__VarType__TensorDesc *desc) {
-    for (int i = 0; i < desc->n_dims; ++i) {
-      int64_t d = desc->dims[i];
-      dims_.emplace_back(d);
-    }
-    data_type_ = (VarType_Type)desc->data_type;
-  }
-
-  std::vector Dims() const { return dims_; }
-  VarType_Type DataType() const { return data_type_; }
-
- private:
-  std::vector dims_;
-  VarType_Type data_type_;
-};
-
-}  // namespace framework
-}  // namespace paddle_mobile
diff --git a/mobile/tools/quantification/src/var_desc.h b/mobile/tools/quantification/src/var_desc.h
deleted file mode 100644
index 0b9c5ac4d6..0000000000
--- a/mobile/tools/quantification/src/var_desc.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include 
-
-#include "src/framework.pb-c.h"
-#include "src/tensor_desc.h"
-
-namespace paddle_mobile {
-namespace framework {
-
-class VarDesc {
- public:
-  VarDesc(const VarDesc &var_desc) {
-    this->data_type_ = var_desc.data_type_;
-    this->name_ = var_desc.name_;
-    this->persistable_ = var_desc.persistable_;
-    this->tensor_desc_ = var_desc.tensor_desc_;
-    this->type_ = var_desc.type_;
-  }
-  explicit VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
-    type_ = (VarType_Type)desc->type->type;
-    name_ = std::string(desc->name);
-    persistable_ = static_cast(desc->persistable);
-
-    switch (type_) {
-      case VARTYPE_TYPE_SELECTED_ROWS:
-        tensor_desc_ = TensorDesc(desc->type->selected_rows);
-        break;
-      case VARTYPE_TYPE_LOD_TENSOR:
-        tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor);
-        break;
-      case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY:
-        // desc->type->tensor_array->tensor->data_type;
-        tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor);
-
-        break;
-      default:
-        break;
-    }
-    switch (type_) {
-      case VARTYPE_TYPE_CHANNEL:
-        data_type_ = (VarType_Type)desc->type->channel->data_type;
-        break;
-      default:
-        data_type_ = tensor_desc_.DataType();
-        break;
-    }
-  }
-  std::string Name() const { return name_; }
-
-  VarType_Type Type() const { return type_; }
-
-  bool Persistable() const { return persistable_; }
-
-  const TensorDesc &Tensor_desc() const { return tensor_desc_; }
-
- private:
-  std::string name_;
-  bool persistable_;
-  TensorDesc tensor_desc_;
-  VarType_Type type_;
-  VarType_Type data_type_;
-};
-
-}  // namespace framework
-}  // namespace paddle_mobile
diff --git a/mobile/tools/quantification/tune_n_fold.py b/mobile/tools/quantification/tune_n_fold.py
deleted file mode 100644
index 6126a397b3..0000000000
--- a/mobile/tools/quantification/tune_n_fold.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*
-
-import os
-import sys
-import math
-import subprocess
-import numpy as np
-import paddle.fluid as fluid
-
-def sh(command):
-    pipe = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    return pipe.stdout.read().decode("utf-8")
-
-for fold in range(100, 1001, 100):
-    print("checking fold : {}".format(fold))
-    max_entropy = sh("./quantify 1 model params {}".format(fold))
-    print("max entropy :", max_entropy, end="")
-    sh("rm -rf scripts/model")
-    sh("rm -rf scripts/quantification_model")
-    sh("cp -r model scripts/model")
-    sh("cp -r model scripts/quantification_model")
-    sh("mv params scripts/quantification_model")
-    diff = sh("cd scripts && python run.py {}".format(fold))
-    print("output diff :", diff, end="")
diff --git a/mobile/tools/shell/change_mobile_namespace.sh b/mobile/tools/shell/change_mobile_namespace.sh
deleted file mode 100755
index aaad6ac193..0000000000
--- a/mobile/tools/shell/change_mobile_namespace.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env bash
-
-# set -o xtrace
-
-extension=$1
-
-convert () {
-    perl -pi -e "s/namespace paddle_mobile/namespace paddle_mobile_${1}/g" "${2}"
-    perl -pi -e "s/paddle_mobile::/paddle_mobile_${1}::/g" "${2}"
-}
-
-revert () {
-    perl -pi -e "s/namespace paddle_mobile_[\w]*/namespace paddle_mobile/g" "${2}"
-    perl -pi -e "s/paddle_mobile_[\w]*::/paddle_mobile::/g" "${2}"
-}
-
-if [[ $2 == "revert" ]]; then
-    for file in $(find src -name "*\.*")
-    do
-        echo "reverting ${file}"
-        revert $extension $file
-    done
-    for file in $(find test -name "*\.*")
-    do
-        echo "reverting ${file}"
-        revert $extension $file
-    done
-else
-    for file in $(find src -name "*\.*")
-    do
-        echo "converting ${file}"
-        convert $extension $file
-    done
-    for file in $(find test -name "*\.*")
-    do
-        echo "converting ${file}"
-        convert $extension $file
-    done
-fi
diff --git a/mobile/tools/shell/check-bitcode.sh b/mobile/tools/shell/check-bitcode.sh
deleted file mode 100644
index a13cfac9c7..0000000000
--- a/mobile/tools/shell/check-bitcode.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/sh
-
-archs=(armv7 armv7s arm64)
-libraries=(*.a)
-libtool="/usr/bin/libtool"
-
-echo "checking bitcode in ${libraries[*]}..."
-
-for library in ${libraries[*]}
-do
-    lipo -info $library
-    
-    # Extract individual architectures for this library
-    for arch in ${archs[*]}
-    do
-            lipo -extract $arch $library -o ${library}_${arch}.a
-    done
-done
-
-for arch in ${archs[*]}
-do
-    source_libraries=""
-    
-    for library in ${libraries[*]}
-    do
-        echo "checking ${library}_${arch}.a"
-        printf "\tbitcode symbol number "
-        otool -l ${library}_${arch}.a | grep bitcode | wc -l
-        # Delete intermediate files
-        rm ${library}_${arch}.a
-    done
-done
-
-echo "bitcode checking complete."
diff --git a/mobile/tools/shell/check-filename.sh b/mobile/tools/shell/check-filename.sh
deleted file mode 100644
index 53eacc8c0e..0000000000
--- a/mobile/tools/shell/check-filename.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/sh
-
-archs=(armv7 armv7s arm64)
-libraries=(*.a)
-libtool="/usr/bin/libtool"
-
-echo "checking filename in ${libraries[*]}..."
-
-for library in ${libraries[*]}
-do
-    lipo -info $library
-    
-    # Extract individual architectures for this library
-    for arch in ${archs[*]}
-    do
-        lipo $library -thin armv7 -output ${library}_${arch}.a
-    done
-done
-
-for arch in ${archs[*]}
-do
-    source_libraries=""
-    
-    for library in ${libraries[*]}
-    do
-        archlib=${library}_${arch}.a
-        echo "checking $archlib"
-        mkdir tmp_check_dir
-        cp $archlib tmp_check_dir
-        cd tmp_check_dir
-        ar -x $archlib
-        ls -alh | grep $1
-        echo ""
-        cd ..
-        # Delete intermediate files
-        rm ${library}_${arch}.a
-        rm -rf tmp_check_dir
-    done
-done
-
-echo "filename checking complete."
diff --git a/mobile/tools/shell/generate-include/.gitignore b/mobile/tools/shell/generate-include/.gitignore
deleted file mode 100644
index af9eaaeff8..0000000000
--- a/mobile/tools/shell/generate-include/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-include
-include.zip
diff --git a/mobile/tools/shell/generate-include/check_include_diff.sh b/mobile/tools/shell/generate-include/check_include_diff.sh
deleted file mode 100644
index eb3dd9d1dc..0000000000
--- a/mobile/tools/shell/generate-include/check_include_diff.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env
-
-include1=$1
-include2=$2
-
-root=$(pwd)
-
-cd $include1
-list1=$(find . -name "*" | sort -n | uniq)
-cd $root
-echo "$list1" > include1.list
-
-cd $include2
-list2=$(find . -name "*" | sort -n | uniq)
-cd $root
-echo "$list2" > include2.list
-
-diff include1.list include2.list
-
-if [ "$?" = "0" ]
-then
-    echo "no diff"
-else
-    echo "has diff"
-fi
-
-rm include1.list
-rm include2.list
-
-echo "done"
diff --git a/mobile/tools/shell/generate-include/main.cpp b/mobile/tools/shell/generate-include/main.cpp
deleted file mode 100644
index 720f09f11a..0000000000
--- a/mobile/tools/shell/generate-include/main.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "io/paddle_mobile.h"
-#include "io/paddle_inference_api.h"
-
-int main() {
-    return 0;
-}
diff --git a/mobile/tools/shell/generate-include/parse.py b/mobile/tools/shell/generate-include/parse.py
deleted file mode 100644
index ba5445c68b..0000000000
--- a/mobile/tools/shell/generate-include/parse.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import sys
-
-output = ""
-for line in sys.stdin:
-    line.strip()
-    tag = "\\"
-    if tag in line:
-        index = line.index("\\")
-        line = line[:index]
-    output += line
-for line in output.split(" "):
-    line = line.strip()
-    if "/Applications" in line:
-        continue
-    if len(line) <= 0:
-        continue
-    if not line.endswith(".h"):
-        continue
-    if not line.startswith("../../../src/"):
-        continue
-    print(line[len("../../../src/"):])
diff --git a/mobile/tools/shell/generate-include/run.sh b/mobile/tools/shell/generate-include/run.sh
deleted file mode 100755
index 1af1bce416..0000000000
--- a/mobile/tools/shell/generate-include/run.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env bash
-
-rm -rf include
-
-mkdir include
-
-g++ -I../../../src/ -M main.cpp | python parse.py | xargs -I % sh -c "dirname %" | sort | uniq | xargs -I % sh -c "mkdir -p include/%"
-
-g++ -I../../../src/ -M main.cpp | python parse.py | xargs -I % sh -c "cp ../../../src/% include/%"
diff --git a/mobile/tools/shell/merge.sh b/mobile/tools/shell/merge.sh
deleted file mode 100644
index 08c19d9286..0000000000
--- a/mobile/tools/shell/merge.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/sh
-
-# Combined all static libaries in the current directory into a single static library
-# It is hardcoded to use the i386, armv7, and armv7s architectures; this can easily be changed via the 'archs' variable at the top
-# The script takes a single argument, which is the name of the final, combined library to be created.
-#
-#   For example:
-#  =>    combine_static_libraries.sh combined-library
-#
-# Script by Evan Schoenberg, Regular Rate and Rhythm Software
-# Thanks to Claudiu Ursache for his blog post at http://www.cvursache.com/2013/10/06/Combining-Multi-Arch-Binaries/ which detailed the technique automated by this script
-#####
-# $1 = Name of output archive
-#####
-
-# archs=(i386 armv7 armv7s)
-archs=(armv7 arm64)
-libraries=(*.a)
-libtool="/usr/bin/libtool"
-
-echo "Combining ${libraries[*]}..."
-
-for library in ${libraries[*]}
-do
-    lipo -info $library
-    
-    # Extract individual architectures for this library
-    for arch in ${archs[*]}
-    do
-            lipo -extract $arch $library -o ${library}_${arch}.a
-    done
-done
-
-# Combine results of the same architecture into a library for that architecture
-source_combined=""
-for arch in ${archs[*]}
-do
-    source_libraries=""
-    
-    for library in ${libraries[*]}
-    do
-        source_libraries="${source_libraries} ${library}_${arch}.a"
-    done
-    
-    $libtool -static ${source_libraries} -o "${1}_${arch}.a"
-    source_combined="${source_combined} ${1}_${arch}.a"
-    
-    # Delete intermediate files
-    rm ${source_libraries}
-done
-
-# Merge the combined library for each architecture into a single fat binary
-lipo -create $source_combined -o $1.a
-
-# Delete intermediate files
-rm ${source_combined}
-
-# Show info on the output library as confirmation
-echo "Combination complete."
-lipo -info $1.a
diff --git a/mobile/tools/shell/prune_static_library.sh b/mobile/tools/shell/prune_static_library.sh
deleted file mode 100644
index 1b555e92bb..0000000000
--- a/mobile/tools/shell/prune_static_library.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/sh
-
-# Split all static libaries in the current directory into corresponding archtectures
-
-archs=(armv7 arm64)
-libraries=(*.a)
-libtool="/usr/bin/libtool"
-
-rm -rf tmp
-mkdir tmp
-
-echo "splitting and pruning ${libraries[*]}..."
-
-for library in ${libraries[*]}
-do
-    lipo -info $library
-    # Extract individual architectures for this library
-    for arch in ${archs[*]}
-    do
-        mkdir -p tmp/$arch
-        lipo -thin $arch $library -o ./tmp/$arch/${library}
-        cd tmp/$arch
-        ar x $library
-        rm $library
-        ar -rcs $library *.o
-        cd ../..
-    done
-done
-
-echo "joining static libriries..."
-cd tmp
-libtool -static -o $library armv7/$library arm64/$library
-
-# # split static library into objects
-# ar x 1.a
-# # join objects into static library
-# ar -rcs 2.a *.o
-# # join static libraries into one single static library
-# libtool -static -o 3.a 1.a 2.a
-# # list file by file size, prune according to file size
-# ls -Slhr directory
diff --git a/mobile/tools/shell/restore-private-repo.sh b/mobile/tools/shell/restore-private-repo.sh
deleted file mode 100644
index d9d29ed3e5..0000000000
--- a/mobile/tools/shell/restore-private-repo.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-git clone https://icode.baidu.com/baidu/bdbox/paddle-mobile-private-repo/
-
-cp -R paddle-mobile-private-repo/paddle-mobile-metallib ../../metal/
diff --git a/mobile/tools/toolchains/arm-android-neon.cmake b/mobile/tools/toolchains/arm-android-neon.cmake
deleted file mode 100644
index 5e431059a9..0000000000
--- a/mobile/tools/toolchains/arm-android-neon.cmake
+++ /dev/null
@@ -1,5 +0,0 @@
-set(ANDROID_ARM_NEON ON)
-set(ANDROID_PIE TRUE)
-set(ANDROID_STL "c++_static")
-set(ANDROID_PLATFORM "android-22")
-include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
diff --git a/mobile/tools/toolchains/arm-linux-gnueabi.cmake b/mobile/tools/toolchains/arm-linux-gnueabi.cmake
deleted file mode 100644
index c2b1b853de..0000000000
--- a/mobile/tools/toolchains/arm-linux-gnueabi.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMake toolchain file for building ARM software on Linux environment
-
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_VERSION 1)
-
-set(CMAKE_C_COMPILER   /usr/bin/arm-linux-gnueabi-gcc)
-set(CMAKE_CXX_COMPILER /usr/bin/arm-linux-gnueabi-g++)
-set(CMAKE_STRIP /usr/bin/arm-linux-gnueabi-strip)
-
-set(CMAKE_FIND_ROOT_PATH  /usr/arm-linux-gnueabi)
-
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-
-set(ARM_LINUX 1)
diff --git a/mobile/tools/toolchains/arm-linux-gnueabihf.cmake b/mobile/tools/toolchains/arm-linux-gnueabihf.cmake
deleted file mode 100644
index 2b8729cd9d..0000000000
--- a/mobile/tools/toolchains/arm-linux-gnueabihf.cmake
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMake toolchain file for building ARM software on Linux environment
-
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR arm)
-set(CMAKE_SYSTEM_VERSION 1)
-
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-- 
GitLab