diff --git a/README.md b/README.md index 83d0a986da1d73151b8915ec60e5aa2f711837b5..22b84888294b5ef60c3d91d7a7909aef8f601d81 100644 --- a/README.md +++ b/README.md @@ -1 +1,74 @@ -编译方法: ./lite/tools/build_bm.sh --target_name=bm --bm_sdk_root=/Paddle-Lite/third-party/bmnnsdk2-bm1684_v2.0.1 bm +[中文版](./README_cn.md) + +# Paddle Lite + + +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) + + + +Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. + +For tutorials, please see [PaddleLite Document](https://paddlepaddle.github.io/Paddle-Lite/). + +## Key Features + +### Light Weight + +On mobile devices, execution module can be deployed without third-party libraries, because our excecution module and analysis module are decoupled. + +On ARM V7, only 800KB are taken up, while on ARM V8, 1.3MB are taken up with the 80 operators and 85 kernels in the dynamic libraries provided by Paddle Lite. + +Paddle Lite enables immediate inference without extra optimization. + +### High Performance + +Paddle Lite enables device-optimized kernels, maximizing ARM CPU performance. + +It also supports INT8 quantizations with [PaddleSlim model compression tools](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim), reducing the size of models and increasing the performance of models. + +On Huawei NPU and FPGA, the performance is also boosted. + +The latest benchmark is located at [benchmark](https://paddlepaddle.github.io/Paddle-Lite/develop/benchmark/) + +### High Compatibility + +Hardware compatibility: Paddle Lite supports a diversity of hardwares — ARM CPU, Mali GPU, Adreno GPU, Huawei NPU and FPGA. 
In the near future, we will also support AI microchips from Cambricon and Bitmain. + +Model compatibility: The Op of Paddle Lite is fully compatible to that of PaddlePaddle. The accuracy and performance of 18 models (mostly CV models and OCR models) and 85 operators have been validated. In the future, we will also support other models. + +Framework compatibility: In addition to models trained on PaddlePaddle, those trained on Caffe and TensorFlow can also be converted to be used on Paddle Lite, via [X2Paddle](https://github.com/PaddlePaddle/X2Paddle). In the future to come, we will also support models of ONNX format. + +## Architecture + +Paddle Lite is designed to support a wide range of hardwares and devices, and it enables mixed execution of a single model on multiple devices, optimization on various phases, and leight-weighted applications on devices. + +![img](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png) + +As is shown in the figure above, analysis phase includes Machine IR module, and it enables optimizations like Op fusion and redundant computation pruning. Besides, excecution phase only involves Kernal exevution, so it can be deployed on its own to ensure maximized light-weighted deployment. + +## Key Info about the Update + +The earlier Paddle-Mobile was designed to be compatible with PaddlePaddle and multiple hardwares, including ARM CPU, Mali GPU, Adreno GPU, FPGA, ARM-Linux and Apple's GPU Metal. Within Baidu, inc, many product lines have been using Paddle-Mobile. For more details, please see: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md). + +As an update of Paddle-Mobile, Paddle Lite has incorporated many older capabilities into the [new architecture](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite). For the time being, the code of Paddle-mobile will be kept under the directory `mobile/`, before complete transfer to Paddle Lite. 
+ +For demands of Apple's GPU Metal and web front end inference, please see `./metal` and `./web` . These two modules will be further developed and maintained. + +## Special Thanks + +Paddle Lite has referenced the following open-source projects: + +- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29) +- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. + + +## Feedback and Community Support + +- Questions, reports, and suggestions are welcome through Github Issues! +- Forum: Opinions and questions are welcome at our [PaddlePaddle Forum](https://ai.baidu.com/forum/topic/list/168)! +- WeChat Official Account: PaddlePaddle +- QQ Group Chat: 696965088 +

     

+

  WeChat Official Account           QQ Group Chat     

diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 76f62765aff791594123d689341b0876b3d0184d..0597ef0cc4ba4c0bcec172c767d66d0f362e1459 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -120,6 +120,7 @@ # ## Lite settings +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto") if (ARM_TARGET_OS STREQUAL "ios") set(PLATFORM "OS") elseif(ARM_TARGET_OS STREQUAL "ios64") diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake index 25aa4d2bc8c1c145e7a103c9164e1c9e231a8f9e..c22bb1db4fbf8a7370ff3e7c9aca40cc94d550a2 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/cross_compiling/npu.cmake @@ -30,7 +30,7 @@ if(NOT NPU_DDK_INC) message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") endif() -include_directories("${NPU_DDK_ROOT}") +include_directories("${NPU_DDK_ROOT}/include") set(NPU_SUB_LIB_PATH "lib64") if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index c053d4ec2bd72258438694143fd08957cd0d35c0..cb6a872e061a51f142bd2301171f0559a1ccb129 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -224,10 +224,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" 
"${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -239,10 +243,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp 
"${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index d57496487a2bb2756f6755916b2761c04aa626d5..a1fde4c152c003e3b1adcea77aa78446ba7a1df5 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -35,6 +35,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE NPU_DEPS ${npu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and 
libhiai_ir_build.so) @@ -45,8 +46,8 @@ else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -91,6 +92,7 @@ if (NOT LITE_ON_TINY_PUBLISH) SRCS cxx_api.cc DEPS ${cxx_api_deps} ${ops} ${host_kernels} program X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} @@ -129,7 +131,9 @@ if(WITH_TESTING) DEPS cxx_api mir_passes lite_api_test_helper ${ops} ${host_kernels} X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -293,12 +297,13 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling model_optimize_tool") lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS gflags kernel op optimizer mir_passes utils) - add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc) + add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light ${ops} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS 
${opencl_kernels} @@ -327,13 +332,14 @@ if(NOT IOS) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) - lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 990d08f18f541088d797510e9dbd4881d42b164f..c1e9fc422450adf96d62c68d622907bd7e15b405 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -201,7 +201,11 @@ void Predictor::Build(const lite_api::CxxConfig &config, const std::string &model_file = config.model_file(); const std::string ¶m_file = config.param_file(); const bool model_from_memory = config.model_from_memory(); - LOG(INFO) << "load from memory " << model_from_memory; + if (model_from_memory) { + LOG(INFO) << "Load model from memory."; + } else { + LOG(INFO) << "Load model from file."; + } Build(model_path, model_file, diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 3e6e10103e9f3af51923459a5921f9781431f352..81ea60eac66849f8ce42fb8cb210226d18bbfa9b 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -42,11 +42,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) - int num_threads = config.cpu_math_library_num_threads(); + int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? 
num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); omp_set_num_threads(real_num_threads); - VLOG(3) << "set_cpu_math_library_math_threads() is set successfully and the " + VLOG(3) << "set_x86_math_library_math_threads() is set successfully and the " "number of threads is:" << num_threads; #endif diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc old mode 100755 new mode 100644 diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index b678c7ecd24c5ffbf3e9e3531264ac195c6a7325..fc23e0b54be41bff5b7b65b4e58908546b186bb4 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -16,8 +16,9 @@ #ifdef PADDLE_WITH_TESTING #include #endif -// "all_kernel_faked.cc" and "kernel_src_map.h" are created automatically during -// model_optimize_tool's compiling period +// "supported_kernel_op_info.h", "all_kernel_faked.cc" and "kernel_src_map.h" +// are created automatically during model_optimize_tool's compiling period +#include #include "all_kernel_faked.cc" // NOLINT #include "kernel_src_map.h" // NOLINT #include "lite/api/cxx_api.h" @@ -25,8 +26,11 @@ #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +#include "supported_kernel_op_info.h" // NOLINT DEFINE_string(model_dir, "", @@ -62,10 +66,16 @@ DEFINE_string(valid_targets, "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); +DEFINE_bool(print_supported_ops, + false, + "Print supported operators on the inputed target"); +DEFINE_bool(print_all_ops, + false, + "Print all the valid operators of Paddle-Lite"); +DEFINE_bool(print_model_ops, false, "Print operators in the input 
model"); namespace paddle { namespace lite_api { - //! Display the kernel information. void DisplayKernels() { LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); @@ -130,9 +140,7 @@ void RunOptimize(const std::string& model_dir, config.set_model_dir(model_dir); config.set_model_file(model_file); config.set_param_file(param_file); - config.set_valid_places(valid_places); - auto predictor = lite_api::CreatePaddlePredictor(config); LiteModelType model_type; @@ -168,6 +176,202 @@ void CollectModelMetaInfo(const std::string& output_dir, lite::WriteLines(std::vector(total.begin(), total.end()), output_path); } +void PrintOpsInfo(std::set valid_ops = {}) { + std::vector targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + int maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (int i = 0; i < targets.size(); i++) { + std::cout << std::setw(10) << targets[i].substr(1); + } + std::cout << std::endl; + if (valid_ops.empty()) { + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + std::cout << std::setw(maximum_optype_length) << it->first; + auto ops_valid_places = it->second; + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } else { + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. 
+ if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } +} +/// Print help information +void PrintHelpInfo() { + // at least one argument should be inputed + const char help_info[] = + "At least one argument should be inputed. Valid arguments are listed " + "below:\n" + " Arguments of model optimization:\n" + " `--model_dir=`\n" + " `--model_file=`\n" + " `--param_file=`\n" + " `--optimize_out_type=(protobuf|naive_buffer)`\n" + " `--optimize_out=`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--prefer_int8_kernel=(true|false)`\n" + " `--record_tailoring_info=(true|false)`\n" + " Arguments of model checking and ops information:\n" + " `--print_all_ops=true` Display all the valid operators of " + "Paddle-Lite\n" + " `--print_supported_ops=true " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display valid operators of input targets\n" + " `--print_model_ops=true --model_dir= " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display operators in the input model\n"; + std::cout << help_info << std::endl; + exit(1); +} + +// Parse Input command +void ParseInputCommand() { + if (FLAGS_print_all_ops) { + std::cout << "All OPs supported by Paddle-Lite: " << supported_ops.size() + << " ops in total." 
<< std::endl; + PrintOpsInfo(); + exit(1); + } else if (FLAGS_print_supported_ops) { + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // get valid_targets string + std::vector target_types = {}; + for (int i = 0; i < valid_places.size(); i++) { + target_types.push_back(valid_places[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (int i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + std::set valid_ops; + for (int i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + PrintOpsInfo(valid_ops); + exit(1); + } +} +// test whether this model is supported +void CheckIfModelSupported() { + // 1. parse valid places and valid targets + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // set valid_ops + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (int i = 0; i < valid_places.size(); i++) { + auto target = valid_places[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = FLAGS_model_dir + "/__model__"; + if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { + prog_path = FLAGS_model_file; + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform 
to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (int index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (FLAGS_print_model_ops) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (int i = 0; i < valid_places.size(); i++) { + targets.push_back(valid_places[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (int i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (FLAGS_print_model_ops) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} void Main() { if (FLAGS_display_kernels) { @@ -241,7 +445,13 @@ void Main() { } // namespace paddle int main(int argc, char** argv) { + // If there is none input argument, print help info. 
+ if (argc < 2) { + paddle::lite_api::PrintHelpInfo(); + } google::ParseCommandLineFlags(&argc, &argv, false); + paddle::lite_api::ParseInputCommand(); + paddle::lite_api::CheckIfModelSupported(); paddle::lite_api::Main(); return 0; } diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index dc9fac96ee848d73ca14c8dc4555c0f44951400a..5b063a8ef19c85d3818d2ca57659170d7d86357d 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -86,6 +86,7 @@ void Run(const std::vector>& input_shapes, for (int i = 0; i < input_shapes[j].size(); ++i) { input_num *= input_shapes[j][i]; } + for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; } diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index a014719c5783bcec3988bba65c53cc8cf52f0b4c..6308699ac91900d161a55ee121e4d9777947fede 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -133,7 +133,9 @@ class LITE_API CxxConfig : public ConfigBase { std::string model_file_; std::string param_file_; bool model_from_memory_{false}; - int cpu_math_library_math_threads_ = 1; +#ifdef LITE_WITH_X86 + int x86_math_library_math_threads_ = 1; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -153,12 +155,14 @@ class LITE_API CxxConfig : public ConfigBase { std::string param_file() const { return param_file_; } bool model_from_memory() const { return model_from_memory_; } - void set_cpu_math_library_num_threads(int threads) { - cpu_math_library_math_threads_ = threads; +#ifdef LITE_WITH_X86 + void set_x86_math_library_num_threads(int threads) { + x86_math_library_math_threads_ = threads; } - int cpu_math_library_num_threads() const { - return cpu_math_library_math_threads_; + int x86_math_library_num_threads() const { + return x86_math_library_math_threads_; } +#endif }; /// MobileConfig is the config for the light weight predictor, it will skip diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index 
075d314df6f46ab9dc8531b26c23d05d24e63bb4..013fd82b19bc22ace22184389249a7b2d9bf237e 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -30,7 +30,9 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { std::string model_dir = FLAGS_model_dir; lite_api::CxxConfig config; config.set_model_dir(model_dir); - config.set_cpu_math_library_num_threads(1); +#ifdef LITE_WITH_X86 + config.set_x86_math_library_num_threads(1); +#endif config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); diff --git a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc deleted file mode 100644 index 99aeea8bdea2a50795dcdca18464a196ee877291..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc +++ /dev/null @@ -1,538 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void conv_3x3s1_depthwise_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int out_c_block = 4; - const int out_h_kernel = 2; - const int out_w_kernel = 4; - const int win_ext = ow + 2; - const int ow_round = ROUNDUP(ow, 4); - const int win_round = ROUNDUP(win_ext, 4); - const int hin_round = oh + 2; - const int prein_size = win_round * hin_round * out_c_block; - auto workspace_size = - threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; - ctx->ExtendWorkspace(sizeof(float) * workspace_size); - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - /// get workspace - float* ptr_zero = ctx->workspace_data(); - memset(ptr_zero, 0, sizeof(float) * win_round); - float* ptr_write = ptr_zero + win_round; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - - int ws = -pad_w; - int we = ws + win_round; - int hs = -pad_h; - int he = hs + hin_round; - int w_loop = ow_round / 4; - auto remain = w_loop * 4 - ow; - bool flag_remain = remain > 0; - remain = 4 - remain; - remain = remain > 0 ? 
remain : 0; - int row_len = win_round * out_c_block; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc; c += out_c_block) { -#ifdef ARM_WITH_OMP - float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; -#else - float* pre_din = ptr_write + ow_round; -#endif - /// const array size - float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT - prepack_input_nxwc4_dw( - din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); - const float* weight_c = weights + c * 9; // kernel_w * kernel_h - float* dout_c00 = dout_batch + c * size_out_channel; - float bias_local[4] = {0, 0, 0, 0}; - if (flag_bias) { - bias_local[0] = bias[c]; - bias_local[1] = bias[c + 1]; - bias_local[2] = bias[c + 2]; - bias_local[3] = bias[c + 3]; - } - float32x4_t vbias = vld1q_f32(bias_local); -#ifdef __aarch64__ - float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 - float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 -#endif - for (int h = 0; h < oh; h += out_h_kernel) { - float* outc00 = dout_c00 + h * ow; - float* outc01 = outc00 + ow; - float* outc10 = outc00 + size_out_channel; - float* outc11 = outc10 + ow; - float* outc20 = outc10 + size_out_channel; - float* outc21 = outc20 + ow; - float* outc30 = outc20 + size_out_channel; - float* outc31 = outc30 + ow; - const float* inr0 = pre_din + h * row_len; - const float* inr1 = inr0 + row_len; - const float* inr2 = inr1 + row_len; - const float* inr3 = 
inr2 + row_len; - if (c + out_c_block > oc) { - switch (c + out_c_block - oc) { - case 3: - outc10 = ptr_write; - outc11 = ptr_write; - case 2: - outc20 = ptr_write; - outc21 = ptr_write; - case 1: - outc30 = ptr_write; - outc31 = ptr_write; - default: - break; - } - } - if (h + out_h_kernel > oh) { - outc01 = ptr_write; - outc11 = ptr_write; - outc21 = ptr_write; - outc31 = ptr_write; - } - float* outl[] = {outc00, - outc10, - outc20, - outc30, - outc01, - outc11, - outc21, - outc31, - reinterpret_cast(bias_local), - reinterpret_cast(flag_relu)}; - void* outl_ptr = reinterpret_cast(outl); - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - float* out0 = pre_out; -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ - "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ - "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ - /* r0, r1, mul w0, get out r0, r1 */ - "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ - "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ - "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ - "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ - "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ - "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ - "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ - "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ - /* r0, r1, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ - "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ - "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ - "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ - 
"fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ - "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ - "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ - /* r0, r1, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ - "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ - "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ - "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ - "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ - "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ - "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ - "ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ - "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ - "ldp x0, x1, [%[outl]] \n" - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , 
%[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ - "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ - "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ - "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ - "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ - "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ - "ldp x2, x3, [%[outl], #16] \n" - "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ - "ldp x4, x5, [%[outl], #32] \n" - "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ - "ldp x6, x7, 
[%[outl], #48] \n" - "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ - "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ - "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ - "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ - - "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ - "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ - "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ - "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ - "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ - "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ - "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ - "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ - - /* transpose */ - "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ - "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ - "trn2 v5.4s, v19.4s, v20.4s\n" /* r1: b0b1d0d1*/ - "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ - "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ - "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ - "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ - "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ - "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ - - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "0:\n" - "cbnz %w[flag_mask], 1f\n" - "str q15, [x0]\n" /* save outc00 */ - "str q16, [x4]\n" /* save outc01 */ - "str q17, [x1]\n" /* 
save outc10 */ - "str q18, [x5]\n" /* save outc11 */ - "str q19, [x2]\n" /* save outc20 */ - "str q20, [x6]\n" /* save outc21 */ - "str q21, [x3]\n" /* save outc30 */ - "str q22, [x7]\n" /* save outc31 */ - "b 2f\n" - "1:\n" - "str q15, [%[out]], #16 \n" /* save remain to pre_out */ - "str q17, [%[out]], #16 \n" /* save remain to pre_out */ - "str q19, [%[out]], #16 \n" /* save remain to pre_out */ - "str q21, [%[out]], #16 \n" /* save remain to pre_out */ - "str q16, [%[out]], #16 \n" /* save remain to pre_out */ - "str q18, [%[out]], #16 \n" /* save remain to pre_out */ - "str q20, [%[out]], #16 \n" /* save remain to pre_out */ - "str q22, [%[out]], #16 \n" /* save remain to pre_out */ - "2:\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), [inr3] "+r"(inr3), - [out]"+r"(out0) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [vbias]"w" (vbias), [outl] "r" (outl_ptr), - [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v9", "v10", "v11", "v15", - "v16","v17","v18","v19","v20","v21","v22", - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" - ); -#else - asm volatile( - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" - /* load r0, r1 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" - "vld1.32 {d4-d7}, [%[r0]]! 
@ load r0, q2, q3\n" - /* main loop */ - "0: @ main loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vmul.f32 q8, q5, q0 @ w0 * inr00\n" - "vmul.f32 q9, q5, q1 @ w0 * inr01\n" - "vmul.f32 q10, q5, q2 @ w0 * inr02\n" - "vmul.f32 q11, q5, q3 @ w0 * inr03\n" - "vmla.f32 q8, q6, q1 @ w1 * inr01\n" - "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" - "vmla.f32 q9, q6, q2 @ w1 * inr02\n" - "vmla.f32 q10, q6, q3 @ w1 * inr03\n" - "vmla.f32 q11, q6, q0 @ w1 * inr04\n" - "vmla.f32 q8, q7, q2 @ w2 * inr02\n" - "vmla.f32 q9, q7, q3 @ w2 * inr03\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" - "vmla.f32 q10, q7, q0 @ w2 * inr04\n" - "vmla.f32 q11, q7, q1 @ w2 * inr05\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" - /* mul r1 with w0-w5, get out r0, r1 */ - "vmul.f32 q12, q5, q2 @ w0 * inr10\n" - "vmul.f32 q13, q5, q3 @ w0 * inr11\n" - "vmul.f32 q14, q5, q0 @ w0 * inr12\n" - "vmul.f32 q15, q5, q1 @ w0 * inr13\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w4 to q5\n" - "vmla.f32 q8, q4, q2 @ w3 * inr10\n" - "vmla.f32 q9, q4, q3 @ w3 * inr11\n" - "vmla.f32 q10, q4, q0 @ w3 * inr12\n" - "vmla.f32 q11, q4, q1 @ w3 * inr13\n" - /* mul r1 with w1, w4, get out r1, r0 */ - "vmla.f32 q8, q5, q3 @ w4 * inr11\n" - "vmla.f32 q12, q6, q3 @ w1 * inr11\n" - "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" - "vmla.f32 q9, q5, q0 @ w4 * inr12\n" - "vmla.f32 q13, q6, q0 @ w1 * inr12\n" - "vmla.f32 q10, q5, q1 @ w4 * inr13\n" - "vmla.f32 q14, q6, q1 @ w1 * inr13\n" - "vmla.f32 q11, q5, q2 @ w4 * inr14\n" - "vmla.f32 q15, q6, q2 @ w1 * inr14\n" - "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" - /* mul r1 with w2, w5, get out r1, r0 */ - "vmla.f32 q12, q7, q0 @ w2 * inr12\n" - "vmla.f32 q13, q7, q1 @ w2 * inr13\n" - "vmla.f32 q8, q6, q0 @ w5 * inr12\n" - "vmla.f32 q9, q6, q1 @ w5 * inr13\n" - "vld1.32 {d0-d3}, [%[r2]]! 
@ load r2, q0, q1\n" - "vmla.f32 q14, q7, q2 @ w2 * inr14\n" - "vmla.f32 q15, q7, q3 @ w2 * inr15\n" - "vmla.f32 q10, q6, q2 @ w5 * inr14\n" - "vmla.f32 q11, q6, q3 @ w5 * inr15\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" - /* mul r2 with w3-w8, get out r0, r1 */ - "vmla.f32 q12, q4, q0 @ w3 * inr20\n" - "vmla.f32 q13, q4, q1 @ w3 * inr21\n" - "vmla.f32 q14, q4, q2 @ w3 * inr22\n" - "vmla.f32 q15, q4, q3 @ w3 * inr23\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" - "vmla.f32 q8, q7, q0 @ w6 * inr20\n" - "vmla.f32 q9, q7, q1 @ w6 * inr21\n" - "vmla.f32 q10, q7, q2 @ w6 * inr22\n" - "vmla.f32 q11, q7, q3 @ w6 * inr23\n" - /* mul r2 with w4, w7, get out r1, r0 */ - "vmla.f32 q8, q4, q1 @ w7 * inr21\n" - "vmla.f32 q12, q5, q1 @ w4 * inr21\n" - "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" - "vmla.f32 q9, q4, q2 @ w7 * inr22\n" - "vmla.f32 q13, q5, q2 @ w4 * inr22\n" - "vmla.f32 q10, q4, q3 @ w7 * inr23\n" - "vmla.f32 q14, q5, q3 @ w4 * inr23\n" - "vmla.f32 q11, q4, q0 @ w7 * inr24\n" - "vmla.f32 q15, q5, q0 @ w4 * inr24\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" - /* mul r1 with w5, w8, get out r1, r0 */ - "vmla.f32 q12, q6, q2 @ w5 * inr22\n" - "vmla.f32 q13, q6, q3 @ w5 * inr23\n" - "vmla.f32 q8, q5, q2 @ w8 * inr22\n" - "vmla.f32 q9, q5, q3 @ w8 * inr23\n" - "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" - "ldr r4, [%[outl], #32] @ load bias addr to r4\n" - "vmla.f32 q14, q6, q0 @ w5 * inr24\n" - "vmla.f32 q15, q6, q1 @ w5 * inr25\n" - "vmla.f32 q10, q5, q0 @ w8 * inr24\n" - "vmla.f32 q11, q5, q1 @ w8 * inr25\n" - "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q7, q2 @ w6 * inr30\n" - "vmla.f32 q13, q7, q3 @ w6 * inr31\n" - "vmla.f32 q14, q7, q0 @ w6 * inr32\n" - "vmla.f32 q15, q7, q1 @ w6 * inr33\n" - "vmla.f32 q12, q4, q3 @ w7 * inr31\n" - "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" - "vld1.32 {d12-d13}, [r4] @ load bias\n" - "vmla.f32 q13, q4, q0 @ w7 * inr32\n" - "vmla.f32 q14, q4, q1 @ w7 * inr33\n" - "vmla.f32 q15, q4, q2 @ w7 * inr34\n" - "ldr r0, [%[outl]] @ load outc00 to r0\n" - "vmla.f32 q12, q5, q0 @ w8 * inr32\n" - "vmla.f32 q13, q5, q1 @ w8 * inr33\n" - "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" - "vmla.f32 q14, q5, q2 @ w8 * inr34\n" - "vmla.f32 q15, q5, q3 @ w8 * inr35\n" - "ldr r1, [%[outl], #4] @ load outc10 to r1\n" - "vadd.f32 q8, q8, q6 @ r00 add bias\n" - "vadd.f32 q9, q9, q6 @ r01 add bias\n" - "vadd.f32 q10, q10, q6 @ r02 add bias\n" - "vadd.f32 q11, q11, q6 @ r03 add bias\n" - "ldr r2, [%[outl], #8] @ load outc20 to r2\n" - "vadd.f32 q12, q12, q6 @ r10 add bias\n" - "vadd.f32 q13, q13, q6 @ r11 add bias\n" - "vadd.f32 q14, q14, q6 @ r12 add bias\n" - "vadd.f32 q15, q15, q6 @ r13 add bias\n" - "ldr r3, [%[outl], #12] @ load outc30 to r3\n" - "vmov.u32 q7, #0 @ mov zero to q7\n" - "cmp r5, #0 @ cmp flag relu\n" - "beq 1f @ skip relu\n" - "vmax.f32 q8, q8, q7 @ r00 relu\n" - "vmax.f32 q9, q9, q7 @ r01 relu\n" - "vmax.f32 q10, q10, q7 @ r02 relu\n" - "vmax.f32 q11, q11, q7 @ r03 relu\n" - "vmax.f32 q12, q12, q7 @ r10 relu\n" - "vmax.f32 q13, q13, q7 @ r11 relu\n" - "vmax.f32 q14, q14, q7 @ r12 relu\n" - "vmax.f32 q15, q15, q7 @ r13 relu\n" - "1:\n" - "ldr r4, [%[outl], #16] @ load outc01 to r4\n" - "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" - "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" - "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" - "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" - "ldr r5, [%[outl], #20] @ 
load outc11 to r5\n" - "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" - "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" - "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" - "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" - "cmp %[flag_mask], #0 @ cmp flag mask\n" - "bne 2f\n" - "vst1.32 {d16-d17}, [r0] @ save outc00\n" - "vst1.32 {d18-d19}, [r1] @ save outc10\n" - "vst1.32 {d20-d21}, [r2] @ save outc20\n" - "vst1.32 {d22-d23}, [r3] @ save outc30\n" - "vst1.32 {d24-d25}, [r4] @ save outc01\n" - "vst1.32 {d26-d27}, [r5] @ save outc11\n" - "ldr r0, [%[outl], #24] @ load outc21 to r0\n" - "ldr r1, [%[outl], #28] @ load outc31 to r1\n" - "vst1.32 {d28-d29}, [r0] @ save outc21\n" - "vst1.32 {d30-d31}, [r1] @ save outc31\n" - "b 3f @ branch end\n" - "2: \n" - "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" - "3: \n" - : [r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [r3] "+r"(inr3), - [out0] "+r"(out0), [wc0] "+r"(weight_c) - : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) - : "cc", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" - ); -#endif // __arch64__ - // clang-format on - outl[0] += 4; - outl[1] += 4; - outl[2] += 4; - outl[3] += 4; - outl[4] += 4; - outl[5] += 4; - outl[6] += 4; - outl[7] += 4; - if (flag_mask) { - memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); - memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); - memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); - memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); - memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); - memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); - memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); - memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc deleted file mode 100644 index 2d75323a9677f1cfbed726a1a28920dd77131688..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_3x3s2_depthwise_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int out_c_block = 4; - const int out_h_kernel = 1; - const int out_w_kernel = 4; - const int win_ext = ow * 2 + 1; - const int ow_round = ROUNDUP(ow, 4); - const int win_round = ROUNDUP(win_ext, 4); - const int hin_round = oh * 2 + 1; - const int prein_size = win_round * hin_round * out_c_block; - auto workspace_size = - threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; - ctx->ExtendWorkspace(sizeof(float) * workspace_size); - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - /// get workspace - auto ptr_zero = ctx->workspace_data(); - memset(ptr_zero, 0, sizeof(float) * win_round); - float* ptr_write = ptr_zero + win_round; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - - int ws = -pad_w; - int we = ws + win_round; - int hs = -pad_h; - int he = hs + hin_round; - int w_loop = ow_round / 4; - auto remain = w_loop * 4 - ow; - bool flag_remain = remain > 0; - remain = 4 - remain; - remain = remain > 0 ? 
remain : 0; - int row_len = win_round * out_c_block; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc; c += out_c_block) { -#ifdef ARM_WITH_OMP - float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; -#else - float* pre_din = ptr_write + ow_round; -#endif - /// const array size - prepack_input_nxwc4_dw( - din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); - const float* weight_c = weights + c * 9; // kernel_w * kernel_h - float* dout_c00 = dout_batch + c * size_out_channel; - float bias_local[4] = {0, 0, 0, 0}; - if (flag_bias) { - bias_local[0] = bias[c]; - bias_local[1] = bias[c + 1]; - bias_local[2] = bias[c + 2]; - bias_local[3] = bias[c + 3]; - } -#ifdef __aarch64__ - float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 - float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 -#endif - for (int h = 0; h < oh; h += out_h_kernel) { - float* outc0 = dout_c00 + h * ow; - float* outc1 = outc0 + size_out_channel; - float* outc2 = outc1 + size_out_channel; - float* outc3 = outc2 + size_out_channel; - const float* inr0 = pre_din + h * 2 * row_len; - const float* inr1 = inr0 + row_len; - const float* inr2 = inr1 + row_len; - if (c + out_c_block > oc) { - switch (c + out_c_block - oc) { - case 3: - outc1 = ptr_write; - case 2: - outc2 = ptr_write; - case 1: - outc3 = ptr_write; - default: - break; - } - } - auto c0 = outc0; - auto c1 = outc1; - auto c2 = outc2; - auto c3 = outc3; - 
float pre_out[16]; - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - if (flag_mask) { - c0 = outc0; - c1 = outc1; - c2 = outc2; - c3 = outc3; - outc0 = pre_out; - outc1 = pre_out + 4; - outc2 = pre_out + 8; - outc3 = pre_out + 12; - } -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldr q8, [%[bias]]\n" /* load bias */ - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "and v19.16b, v8.16b, v8.16b\n" - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "and v20.16b, v8.16b, v8.16b\n" - "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ - "and v21.16b, v8.16b, v8.16b\n" - "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ - "and v22.16b, v8.16b, v8.16b\n" - "ldr q8, [%[inr0]]\n" /* load input r0*/ - /* r0 mul w0-w2, get out */ - "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ - "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ - "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ - "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ - "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ - "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ - "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ - "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ - "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ - "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ - "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ - "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ - "ldr q8, [%[inr1]]\n" /* load input r1*/ - /* r1, mul w3-w5, get out */ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ - "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ - "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ - "fmla v22.4s , %[w3].4s, 
v6.4s\n" /* outr3 = w3 * r1, 6*/ - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ - "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ - "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ - "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ - "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ - "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ - "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ - "ldr q8, [%[inr2]]\n" /* load input r2*/ - /* r2, mul w6-w8, get out r0, r1 */ - "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ - "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ - "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ - "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ - "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ - "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ - "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ - "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ - "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ - "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ - "fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/ - "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ - /* transpose */ - "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ - "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - /* relu */ - "cbz %w[flag_relu], 0f\n" /* skip relu*/ 
- "movi v0.4s, #0\n" /* for relu */ - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - /* save result */ - "0:\n" - "str q19, [%[outc0]], #16\n" - "str q20, [%[outc1]], #16\n" - "str q21, [%[outc2]], #16\n" - "str q22, [%[outc3]], #16\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v19","v20","v21","v22" - ); -#else - asm volatile( - /* fill with bias */ - "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ - /* load weights */ - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ - "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ - "vand.i32 q12, q8, q8\n" - "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ - "vand.i32 q13, q8, q8\n" - "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ - "vand.i32 q14, q8, q8\n" - "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ - "vand.i32 q15, q8, q8\n" - "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ - /* mul r0 with w0, w1, w2 */ - "vmla.f32 q12, q9, q0 @ w0 * inr0\n" - "vmla.f32 q13, q9, q2 @ w0 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ - "vmla.f32 q14, q9, q4 @ w0 * inr4\n" - "vmla.f32 q15, q9, q6 @ w0 * inr6\n" - "vmla.f32 q12, q10, q1 @ w1 * inr1\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w1 * inr3\n" - "vmla.f32 q14, q10, q5 @ w1 * inr5\n" - "vmla.f32 q15, q10, q7 @ w1 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w2 * inr2\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w2 * inr4\n" - "vld1.32 {d8-d11}, [%[r1]]! 
@ load r1, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w2 * inr6\n" - "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w2 * inr8\n" - /* mul r1 with w3, w4, w5 */ - "vmla.f32 q12, q9, q0 @ w3 * inr0\n" - "vmla.f32 q13, q9, q2 @ w3 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ - "vmla.f32 q14, q9, q4 @ w3 * inr4\n" - "vmla.f32 q15, q9, q6 @ w3 * inr6\n" - "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ - "vmla.f32 q12, q10, q1 @ w4 * inr1\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w4 * inr3\n" - "vmla.f32 q14, q10, q5 @ w4 * inr5\n" - "vmla.f32 q15, q10, q7 @ w4 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w5 * inr2\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w5 * inr4\n" - "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w5 * inr6\n" - "vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w5 * inr8\n" - /* mul r2 with w6, w7, w8 */ - "vmla.f32 q12, q9, q0 @ w6 * inr0\n" - "vmla.f32 q13, q9, q2 @ w6 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ - "vmla.f32 q14, q9, q4 @ w6 * inr4\n" - "vmla.f32 q15, q9, q6 @ w6 * inr6\n" - "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ - "vmla.f32 q12, q10, q1 @ w7 * inr1\n" - "vmla.f32 q13, q10, q3 @ w7 * inr3\n" - "vmla.f32 q14, q10, q5 @ w7 * inr5\n" - "vmla.f32 q15, q10, q7 @ w7 * inr7\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - "vmla.f32 q12, q11, q2 @ w8 * inr2\n" - "vmla.f32 q13, q11, q4 @ w8 * inr4\n" - "vmla.f32 q14, q11, q6 @ w8 * inr6\n" - "vmla.f32 q15, q11, q8 @ w8 * inr8\n" - /* transpose */ - "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ - "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ - "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ - "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ - "cmp %[flag_relu], #0\n" - "beq 0f\n" /* skip relu*/ - "vmov.u32 q0, #0\n" - "vmax.f32 q12, 
q12, q0\n" - "vmax.f32 q13, q13, q0\n" - "vmax.f32 q14, q14, q0\n" - "vmax.f32 q15, q15, q0\n" - "0:\n" - "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ - "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ - "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ - "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ - :[r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [wc0] "+r" (weight_c), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[bias] "r" (bias_local), - [flag_relu]"r"(flag_relu) - :"cc", "memory", - "q0","q1","q2","q3","q4","q5","q6","q7", - "q8", "q9","q10","q11","q12","q13","q14","q15" - ); -#endif // __arch64__ - // clang-format off - if (flag_mask) { - for (int i = 0; i < remain; ++i) { - c0[i] = pre_out[i]; - c1[i] = pre_out[i + 4]; - c2[i] = pre_out[i + 8]; - c3[i] = pre_out[i + 12]; - } - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p0.cc b/lite/backends/arm/math/conv_depthwise_3x3p0.cc deleted file mode 100644 index 0c050ffe6fb0f064f5c26ea0da6acee17f4403ae..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p0.cc +++ /dev/null @@ -1,4178 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? 
bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - // "cmp %[cnt], #1 \n" - // "blt 5f \n" - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - 
w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" 
/*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, 
v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += 
din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 
= din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // 
{1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla 
v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", 
- "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * 
w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? 
bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, 
#4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v6.4s}, 
[%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - 
"fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , 
v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - // r3 - "fmla v15.4s , v8.4s, 
%[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* 
din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * 
size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // 
{1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, 
%[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] 
"w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - "vmax.f32 q3, q3, q9 @ relu \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if 
(flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, #8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul 
v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? 
bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * 
wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, 
- const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } 
- } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - 
doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, #8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla 
v14.4s, v4.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - "fmax v12.4s, v12.4s, %[zero].4s \n" - "fmax v13.4s, v13.4s, %[zero].4s \n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! 
@ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, 
%f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - // doutr0 = doutr1; - // doutr1 += w_out; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - 
vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]] \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - "fmax v4.4s, v4.4s, v9.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - 
"q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p1.cc b/lite/backends/arm/math/conv_depthwise_3x3p1.cc deleted file mode 100644 index 6f28d48d6d2bdd60e0c33f9b4b753835337fc8a4..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p1.cc +++ /dev/null @@ -1,4850 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? 
bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - 
"fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // 
r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - 
"st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - 
w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" 
/*vld1q_f32(din_ptr0)*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, 
%[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* 
outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] 
"+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - 
"vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 
4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = 
dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], 
%[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext 
v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // 
v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - 
"ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int tile_h = (h_in + 3) >> 2; - int cnt_col = tile_w - 2; - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - int size_pad_bottom = (unsigned int)(1 + (tile_h << 2) - h_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * 
ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - 
"fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // 
r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* 
v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext 
v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b 
\n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 
* - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, 
v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 
4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = 
dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], 
%[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, 
v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, 
%[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla 
v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = 
(h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, 
%[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, 
%q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - 
unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w21 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} -/** - * \brief depthwise convolution, kernel 
size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm 
pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "fmax v12.4s, v12.4s, %[zero].4s\n" // out1 -> relu - "fmax v13.4s, v13.4s, 
%[zero].4s\n" // out2 -> relu - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" 
// out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmax.f32 q14, q14, %q[zero]\n" // out1 -> relu - "vmax.f32 q15, q15, %q[zero]\n" // out2 -> relu - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < 
ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, 
%[wr2].s[1] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w21 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" // out add bias - "fmax v4.4s, v4.4s, v9.4s \n" - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu\n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git 
a/lite/backends/arm/math/conv_depthwise_3x3s1.cc b/lite/backends/arm/math/conv_depthwise_3x3s1.cc deleted file mode 100644 index 8d0ebb58ad1b7e325bae3649b13914641021038f..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3s1.cc +++ /dev/null @@ -1,2539 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float 
*weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1_fp32(const float *din, - float *dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float *weights, - const float *bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext *ctx) { - if (pad == 0) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - if (pad == 1) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } -} - -#ifdef __aarch64__ -#define INIT_S1 \ - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ - "movi v21.4s, #0x0\n" /* out0 = 0 */ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - 
"ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - -#define LEFT_COMPUTE_S1 \ - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ - \ - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v17.16b, v4.16b, 
v5.16b, #4 \n" /* v16=1234 */ \ - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ - \ - /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, 
%[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - -#define LEFT_RESULT_S1 \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "blt 3f \n" - -#define MID_COMPUTE_S1 \ - "1: \n" /* r0 */ \ - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * 
w0[0]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 
1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define MID_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* 
outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_COMPUTE_S1 \ - "3: \n" \ - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ - "ld1 {v22.4s}, [%[doutr0]] \n" \ - "ld1 {v23.4s}, [%[doutr1]] \n" \ - "ld1 {v24.4s}, [%[doutr2]] \n" \ - "ld1 {v25.4s}, [%[doutr3]] \n" \ - \ - "bif v0.16b, %[vzero].16b, v18.16b \n" \ - "bif v1.16b, %[vzero].16b, v19.16b \n" \ - "bif v2.16b, %[vzero].16b, v18.16b \n" \ - "bif v3.16b, %[vzero].16b, v19.16b \n" \ - \ - "bif v4.16b, %[vzero].16b, v18.16b \n" \ - "bif v5.16b, %[vzero].16b, v19.16b \n" \ - "bif v6.16b, %[vzero].16b, v18.16b \n" \ - "bif v7.16b, %[vzero].16b, v19.16b \n" \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v8.16b, %[vzero].16b, v18.16b \n" \ - "bif v9.16b, %[vzero].16b, v19.16b \n" \ - "bif v10.16b, %[vzero].16b, v18.16b \n" \ - "bif v11.16b, %[vzero].16b, v19.16b \n" \ - \ - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v18.4s}, [%[rmask]] \n" \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s 
, v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * 
w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define LEFT_RESULT_S1_RELU \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - \ 
- "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "blt 3f \n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, 
[%[doutr0]], #16 \n" \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 
\n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 
+= din0_0123 * w0[0]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define COMPUTE_S_S1 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s}, [%[din0]], #16\n" \ - "ld1 {v1.4s}, [%[din1]], #16\n" \ - "ld1 {v2.4s}, [%[din2]], #16\n" \ - "ld1 {v3.4s}, [%[din3]], #16\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask].16b\n" \ - "bif v2.16b, %[zero].16b, %[mask].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask].16b\n" \ - \ - "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ - "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ - "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ - "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ - \ - "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ - "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ - "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ - "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ - \ - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ - \ - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ - \ - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ - \ - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ - \ - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ - \ - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ - \ - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ - \ 
- "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v14.4s\n" \ - "fadd v12.4s, v12.4s, v16.4s\n" \ - \ - "fadd v13.4s, v13.4s, v15.4s\n" \ - "fadd v13.4s, v13.4s, v17.4s\n" \ - \ - "fadd v12.4s, v12.4s, %[bias].4s\n" \ - "fadd v13.4s, v13.4s, %[bias].4s\n" - -#define RESULT_S_S1 \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "fmax v12.4s, v12.4s, %[zero].4s\n" \ - "fmax v13.4s, v13.4s, %[zero].4s\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v7.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "ext v8.16b, v0.16b, v1.16b, #4\n" \ - "ext v9.16b, v0.16b, v1.16b, #8\n" \ - \ - "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ - "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "ext v8.16b, v2.16b, v3.16b, #4\n" \ - "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ - \ - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ - \ 
- "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ - \ - "ext v8.16b, v4.16b, v5.16b, #4\n" \ - "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "ext v8.16b, v6.16b, v7.16b, #4\n" \ - "ext v9.16b, v6.16b, v7.16b, #8\n" \ - \ - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fadd v12.4s, v12.4s, v10.4s\n" \ - \ - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v11.4s\n" \ - "fadd v13.4s, v13.4s, v14.4s\n" \ - "fadd v13.4s, v13.4s, v15.4s\n" // \ - // "prfm pldl1keep, [%[out1]]\n" \ - // "prfm pldl1keep, [%[out2]]\n" \ - // \ - // "st1 {v12.4s}, [%[out1]]\n" \ - // "st1 {v13.4s}, [%[out2]]\n" \ - - -#else -#define INIT_S1 \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" - -#define LEFT_COMPUTE_S1 \ - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ - "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ - "vext.32 q7, q10, q11, #1 @ 1234\n" \ - \ - /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ - "vext.32 q7, q12, q13, #1 @ 1234\n" \ - \ - /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ - "vext.32 q7, q14, q15, #1 @ 1234\n" - -#define LEFT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_COMPUTE_S1 \ - "1: @ right pad entry\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define MID_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_COMPUTE_S1 \ - "3: @ right pad entry\n" \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define LEFT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define COMPUTE_S_S1 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - \ - "vld1.32 {d12-d13}, [%[din0]]!\n" \ - "vld1.32 {d14-d15}, [%[din1]]!\n" \ - "vld1.32 {d16-d17}, [%[din2]]!\n" \ - "vld1.32 {d18-d19}, [%[din3]]!\n" \ - \ - "vbif q6, %q[vzero], %q[mask]\n" \ - "vbif q7, %q[vzero], %q[mask]\n" \ - "vbif q8, %q[vzero], %q[mask]\n" \ - "vbif q9, %q[vzero], %q[mask]\n" \ - \ - "vmul.f32 q14, q6, %e[wr0][1]\n" \ - "vmul.f32 q15, q7, %e[wr0][1]\n" \ - \ - "vmla.f32 q14, q7, %e[wr1][1]\n" \ - "vmla.f32 q15, q8, %e[wr1][1]\n" \ - \ - "vmla.f32 q14, q8, %e[wr2][1]\n" \ - "vmla.f32 q15, q9, %e[wr2][1]\n" \ - \ - "vext.32 q10, %q[vzero], q6, #3\n" \ - "vext.32 q11, %q[vzero], q7, #3\n" \ - "vext.32 q12, %q[vzero], q8, #3\n" \ - "vext.32 q13, %q[vzero], q9, #3\n" \ - \ - "vmla.f32 q14, q10, %e[wr0][0]\n" \ - "vmla.f32 q15, q11, %e[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %e[wr1][0]\n" \ - "vmla.f32 q15, q12, %e[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %e[wr2][0]\n" \ - "vmla.f32 q15, q13, %e[wr2][0]\n" \ - \ - "vext.32 q10, q6, %q[vzero], #1\n" \ - "vext.32 q11, q7, %q[vzero], #1\n" \ - "vext.32 q12, q8, %q[vzero], #1\n" \ - "vext.32 q13, q9, %q[vzero], #1\n" \ - \ - "vmla.f32 q14, q10, %f[wr0][0]\n" \ - "vmla.f32 q15, q11, %f[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %f[wr1][0]\n" \ - "vmla.f32 q15, q12, %f[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %f[wr2][0]\n" \ - "vmla.f32 q15, q13, %f[wr2][0]\n" \ - \ - "vadd.f32 q14, q14, %q[bias]\n" \ - "vadd.f32 q15, q15, %q[bias]\n" - -#define RESULT_S_S1 \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "pld 
[%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmax.f32 q14, q14, %q[vzero]\n" \ - "vmax.f32 q15, q15, %q[vzero]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - 
"vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ - \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ - \ - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ - "vadd.f32 q15, q5, q9 @ q4 += q10 \n" - -#endif -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? 
bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - float *ptr_zero = const_cast(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - 
"q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float *dr0 = din_channel + hs * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 
2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = 
doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? 
bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - - float *ptr_zero = const_cast(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 4: - din_ptr1 = zero_ptr; - case 3: - din_ptr2 = zero_ptr; - case 2: - din_ptr3 = zero_ptr; - case 1: - din_ptr4 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] 
"+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_out; i += 2) { - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - case 0: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - 
"q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#endif // __aarch64__ - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < 
h_out; j += 2) { - const float *dr0 = din_channel + j * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else - unsigned int *vmask_ptr = vmask; - float bias_val = flag_bias ? 
bias[i] : 0.f; - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3s2.cc b/lite/backends/arm/math/conv_depthwise_3x3s2.cc deleted file mode 100644 index ec039af98cb7e4fb037475dd4e5ee29204252165..0000000000000000000000000000000000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3s2.cc +++ /dev/null @@ -1,1862 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (pad == 0) { - if (w_in > 7) { - 
conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - if (pad == 1) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } -} -#ifdef __aarch64__ -#define INIT_S2 \ - "prfm pldl1keep, [%[inptr0]] \n" \ - "prfm pldl1keep, [%[inptr1]] \n" \ - "prfm pldl1keep, [%[inptr2]] \n" \ - "prfm pldl1keep, [%[inptr3]] \n" \ - "prfm pldl1keep, [%[inptr4]] \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" - -#define LEFT_COMPUTE_S2 \ - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ - "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ - "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ - \ - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ - \ - "sub %[inptr0], %[inptr0], #4 \n" \ - "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ - \ - "sub %[inptr2], %[inptr2], #4 \n" \ - "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ - \ - 
"fmul v14.4s, v5.4s, %[w0].s[2] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ - \ - "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" - -#define LEFT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "cmp %w[cnt], #1 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "blt 1f \n" - -#define MID_COMPUTE_S2 \ - "2: \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, v18.16b, #4 \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, v19.16b, #4 \n" \ - \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ 
- "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, v20.16b, #4 \n" \ - \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, v21.16b, #4 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" - -#define MID_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "bne 2b \n" - -#define RIGHT_COMPUTE_S2 \ - "1: \n" \ - "cmp %w[remain], #1 \n" \ - "blt 4f \n" \ - "3: \n" \ - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ - \ - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ - \ - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ - \ - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ - \ - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" 
\ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ - "ld1 {v0.4s}, [%[outptr0]] \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" \ - "ld1 {v1.4s}, [%[outptr1]] \n" - -#define RIGHT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "bif v16.16b, v0.16b, %[wmask].16b \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "bif v17.16b, v1.16b, %[wmask].16b \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - "4: \n" - -#define LEFT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" \ - \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 
{v15.4s}, [%[inptr0]] \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" \ - \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "cmp %w[cnt], #1 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "blt 1f \n" - -#define MID_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "bne 2b \n" - -#define RIGHT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "bif v16.16b, v0.16b, %[wmask].16b \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ - \ - "bif v17.16b, v1.16b, %[wmask].16b \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - "4: \n" - -#define COMPUTE_S_S2 \ - "movi v9.4s, #0 \n" \ - "ld1 
{v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ - \ - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ - \ - "bif v10.16b, v9.16b, v6.16b \n" \ - "bif v11.16b, v9.16b, v7.16b \n" \ - "bif v12.16b, v9.16b, v6.16b \n" \ - "bif v13.16b, v9.16b, v7.16b \n" \ - "bif v14.16b, v9.16b, v6.16b \n" \ - "bif v15.16b, v9.16b, v7.16b \n" \ - \ - "ext v6.16b, v9.16b, v11.16b, #12 \n" \ - "ext v7.16b, v9.16b, v13.16b, #12 \n" \ - "ext v8.16b, v9.16b, v15.16b, #12 \n" \ - \ - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ - \ - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ - \ - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ - \ - "fadd v4.4s, v4.4s, v5.4s \n" \ - "fadd v4.4s, v4.4s, v6.4s \n" - -#define RESULT_S_S2 \ - "fadd v4.4s, v4.4s, %[bias].4s \n" \ - \ - "st1 {v4.4s}, [%[out]] \n" - -#define RESULT_S_S2_RELU \ - "fadd v4.4s, v4.4s, %[bias].4s \n" \ - "fmax v4.4s, v4.4s, v9.4s \n" \ - \ - "st1 {v4.4s}, [%[out]] \n" - -#define COMPUTE_S_S2_P0 \ - "movi v9.4s, #0 \n" \ - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ - \ - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ - "and v4.16b, %[bias].16b, %[bias].16b \n" \ - \ - "bif v10.16b, v9.16b, v6.16b \n" \ - "bif v11.16b, v9.16b, v7.16b \n" \ - "bif v12.16b, v9.16b, v6.16b \n" \ - "bif v13.16b, v9.16b, v7.16b \n" \ - "bif v14.16b, v9.16b, v6.16b \n" \ - "bif v15.16b, v9.16b, v7.16b \n" \ - \ - "ext v6.16b, v10.16b, v9.16b, #4 \n" \ - "ext v7.16b, v12.16b, v9.16b, #4 \n" \ - "ext v8.16b, v14.16b, v9.16b, #4 \n" \ - \ - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ - "fmul v16.4s, v6.4s, 
%[wr0].s[2] \n" \ - \ - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ - \ - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ - \ - "fadd v4.4s, v4.4s, v5.4s \n" \ - "fadd v4.4s, v4.4s, v16.4s \n" - -#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" - -#define RESULT_S_S2_P0_RELU \ - "fmax v4.4s, v4.4s, v9.4s \n" \ - "st1 {v4.4s}, [%[out]] \n" - -#else -#define INIT_S2 \ - "vmov.u32 q9, #0 \n" \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - \ - "vdup.32 q3, %[bias] @ and \n" - -#define LEFT_COMPUTE_S2 \ - "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ - "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ - "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ - \ - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ - \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ - \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define LEFT_RESULT_S2 \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "cmp %[cnt], #1 \n" \ - "blt 1f \n" - -#define MID_COMPUTE_S2 \ - "2: \n" \ - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ - \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define MID_RESULT_S2 \ - "subs %[cnt], #1 \n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "bne 2b \n" - -#define RIGHT_COMPUTE_S2 \ - "1: \n" \ - "cmp %[remain], #1 \n" \ - "blt 3f \n" \ - \ - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RIGHT_RESULT_S2 \ - "vbif.f32 q3, q10, q11 @ write mask\n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "3: \n" - -#define LEFT_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "cmp %[cnt], #1 \n" \ - "blt 1f \n" - -#define MID_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "subs %[cnt], #1 \n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "bne 2b \n" - -#define RIGHT_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vbif.f32 q3, q10, q11 @ write mask\n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "3: \n" - -#define COMPUTE_S_S2 \ - "vmov.u32 q9, #0 \n" \ - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" - -#define RESULT_S_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu\n" \ - \ - "vst1.32 {d6-d7}, [%[out]] \n" - -#define COMPUTE_S_S2_P0 \ - "vmov.u32 q9, #0 \n" \ - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" - -#define RESULT_S_S2_P0_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vst1.32 {d6-d7}, [%[out]] \n" - -#endif - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - * w_in > 7 - */ -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int 
cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#else - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[i]; - } -#endif // __aarch64__ - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int 
i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] 
"w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - doutr0 = doutr0 + w_out; - } 
-#endif - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] 
"+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int 
right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#else - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[i]; - } -#endif // __aarch64__ - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* 
doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i * 2 + 5 > h_in) { - switch (i * 2 + 5 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, 
v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! 
process bottom pad - if (i * 2 + 3 > h_in) { - switch (i * 2 + 3 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU - RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 
4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; j++) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - if (j * 2 + 2 >= h_in) { - switch (j + 2 - h_in) { - case 1: - din1_ptr = zero_ptr; - case 0: - din2_ptr = zero_ptr; - default: - break; - } - } - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), 
- [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/reduce_prod.cc b/lite/backends/arm/math/reduce_prod.cc old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/reduce_prod.h b/lite/backends/arm/math/reduce_prod.h old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.cc b/lite/backends/arm/math/split_merge_lod_tenosr.cc old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.h b/lite/backends/arm/math/split_merge_lod_tenosr.h old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/dl_engine.hpp 
b/lite/backends/fpga/KD/dl_engine.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/pes/gru_util.hpp b/lite/backends/fpga/KD/pes/gru_util.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc old mode 100644 new mode 100755 diff --git a/lite/backends/npu/builder.cc b/lite/backends/npu/builder.cc deleted file mode 100644 index 954fad8c916e152c5de06ce285b4ac17ecf22a01..0000000000000000000000000000000000000000 --- a/lite/backends/npu/builder.cc +++ /dev/null 
@@ -1,192 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/builder.h" -#include // NOLINT -#include -#include "lite/backends/npu/runtime.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data) { - LOG(INFO) << "[NPU] Build model."; - CHECK_GT(inputs.size(), 0); - CHECK_GT(outputs.size(), 0); - CHECK_NE(model_data, 0); - // build IR graph to om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(inputs).SetOutputs(outputs); - ge::Model om_model("model", "model"); - om_model.SetGraph(ir_graph); - domi::HiaiIrBuild ir_build; - domi::ModelBufferData om_model_buf; - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return false; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - return false; - } - // store om model into tensor - model_data->Resize({om_model_buf.length}); - memcpy(model_data->mutable_data(), - om_model_buf.data, - om_model_buf.length); - ir_build.ReleaseModelBuff(om_model_buf); - return true; -} - -std::string UniqueName(const std::string& prefix) { - static std::mutex counter_mtx; - static std::unordered_map counter_map; - 
std::unique_lock counter_lck(counter_mtx); - int counter = 1; - auto it = counter_map.find(prefix); - if (it == counter_map.end()) { - counter_map[prefix] = counter; - } else { - counter = ++(it->second); - } - return prefix + "_" + std::to_string(counter); -} - -ge::DataType CvtPrecisionType(PrecisionType itype) { - ge::DataType otype = ge::DT_FLOAT; - switch (itype) { - case PRECISION(kFloat): - otype = ge::DT_FLOAT; - break; - case PRECISION(kInt8): - otype = ge::DT_INT8; - break; - case PRECISION(kInt32): - otype = ge::DT_INT32; - break; - default: - LOG(FATAL) << "[NPU] Can not convert precision type(" - << PrecisionToStr(itype) << ") from Lite to NPU"; - break; - } - return otype; -} - -ge::Format CvtDataLayoutType(DataLayoutType itype) { - ge::Format otype = ge::FORMAT_NCHW; - switch (itype) { - case DATALAYOUT(kNCHW): - otype = ge::FORMAT_NCHW; - break; - // TODO(hong19860320) support more data layout type - default: - LOG(FATAL) << "[NPU] Can not convert data layout type(" - << DataLayoutToStr(itype) << ") from Lite to NPU"; - break; - } - return otype; -} - -ge::TensorPtr CvtTensor(lite::Tensor* in_tensor, - std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - uint8_t* in_data = nullptr; - auto in_size = in_tensor->dims().production(); - auto in_shape = in_tensor->dims().Vectorize(); - if (out_shape.empty()) { - out_shape = in_shape; - } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "[NPU] Unknow precision type " << PrecisionToStr(in_ptype); - } - ge::DataType out_ptype = CvtPrecisionType(in_ptype); - ge::Format out_ltype 
= CvtDataLayoutType(in_ltype); - - ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype); - CHECK_EQ(out_ltype, ge::FORMAT_NCHW); - - auto out_size = out_desc.GetShape().GetShapeSize(); - CHECK_EQ(out_size, in_size); - - ge::TensorPtr out_tensor = std::make_shared(); - out_tensor->SetTensorDesc(out_desc); - out_tensor->SetData(in_data, in_bytes); - return out_tensor; -} - -int CvtActMode(std::string act_type) { - int act_mode = 1; - if (act_type == "sigmoid") { - act_mode = 0; - } else if (act_type == "relu") { - act_mode = 1; - } else if (act_type == "tanh") { - act_mode = 2; - } else if (act_type == "relu_clipped") { - act_mode = 3; - } else if (act_type == "elu") { - act_mode = 4; - } else if (act_type == "leaky_relu") { - act_mode = 5; - } else if (act_type == "abs") { - act_mode = 6; - } else if (act_type == "softsign") { - act_mode = 8; - } else if (act_type == "softplus") { - act_mode = 9; - } else if (act_type == "hard_sigmoid") { - act_mode = 10; - } else { - // TODO(hong19860320) support more activation mode - LOG(FATAL) << "[NPU] Unsupported activation type " << act_type; - } - return act_mode; -} - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname) { - auto iarg_names = op_info->input_argnames(); - if (std::find(iarg_names.begin(), iarg_names.end(), argname) != - iarg_names.end()) { - auto inputs = op_info->Input(argname); - if (inputs.empty()) { - return false; - } - auto var_name = inputs.front(); - auto var = scope->FindVar(var_name); - return var != nullptr; - } else { - return false; - } -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h deleted file mode 100644 index 70200354fbab15f043a537300e92e2a26a3d739e..0000000000000000000000000000000000000000 --- a/lite/backends/npu/builder.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "ai_ddk_lib/include/hiai_ir_build.h" -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -// Extended Ops of HIAI DDK -namespace ge { -/** - * Pads a tensor. 
- * - * x : the input tensor - * padding : the input tensor must be 2-D - * constant_values : constant values must be a scalar - * - * output : the output tensor - * - * t_paddings : Default DT_INT32 , t_paddings must be the same with - * datatype of the padding - * mode : 0: CONSTANT, 1: REFLECT, 2: SYMMETRIC - * T : datatype of constant_values DT_INT32:3 DT_FLOAT:0 - */ -REG_OP(Pad) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(padding, TensorType({DT_INT32})) - .OPTIONAL_INPUT(constant_values, TensorType({DT_INT32, DT_FLOAT})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(t_paddings, AttrValue::INT{3}) - .ATTR(mode, AttrValue::INT{0}) - .REQUIRED_ATTR(T, AttrValue::INT) - .OP_END(); - -} // namespace ge - -namespace paddle { -namespace lite { -namespace npu { - -class OpList { - public: - static OpList& Global() { - static thread_local OpList x; - return x; - } - void clear() { lists_.clear(); } - void add(std::shared_ptr p) { lists_.push_back(p); } - - private: - std::vector> lists_; -}; - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data); - -std::string UniqueName(const std::string& prefix); - -ge::DataType CvtPrecisionType(PrecisionType itype); - -ge::Format CvtDataLayoutType(DataLayoutType itype); - -ge::TensorPtr CvtTensor(Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -template -ge::TensorPtr CreateTensorAndFillData(std::vector data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else { - LOG(FATAL) << "[NPU] Unknow value type 
" << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - -int CvtActMode(std::string act_type); - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc old mode 100755 new mode 100644 diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h old mode 100755 new mode 100644 index 3eba0b77e4bdeb26cdff869771645a5ce7637ae4..411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -18,8 +18,8 @@ #include #include #include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "HiAiModelManagerService.h" // NOLINT +#include "hiai_ir_build.h" // NOLINT namespace paddle { namespace lite { diff --git a/lite/backends/npu/runtime.cc b/lite/backends/npu/runtime.cc deleted file mode 100644 index 3485f63c7c8bb91081fd1969d0d41733417149d9..0000000000000000000000000000000000000000 --- a/lite/backends/npu/runtime.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/runtime.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Create hiai model manager to load om model from lite tensor, and return the -// manager and an unique model name -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name) { - LOG(INFO) << "[NPU] Load model."; - auto model_data_ptr = model_data.data(); - auto model_data_size = model_data.numel() * sizeof(int8_t); - if (model_data_ptr == nullptr || model_data_size == 0) { - return false; - } - *model_client = std::make_shared(); - int ret = (*model_client)->Init(nullptr); - if (ret != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed(" << ret << ")!"; - return false; - } - *model_name = "model.om"; - auto model_desc = std::make_shared( - *model_name, - DeviceInfo::Global().freq_level(), - DeviceInfo::Global().framework_type(), - DeviceInfo::Global().model_type(), - DeviceInfo::Global().device_type()); - model_desc->SetModelBuffer(model_data_ptr, model_data_size); - std::vector> model_descs; - model_descs.push_back(model_desc); - if ((*model_client)->Load(model_descs) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - return false; - } - return true; -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/runtime.h b/lite/backends/npu/runtime.h deleted file mode 100644 index 
8b1ad51518d8626d9a6ecd6203a70b2637bb6004..0000000000000000000000000000000000000000 --- a/lite/backends/npu/runtime.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace npu { - -class DeviceInfo { - public: - static DeviceInfo &Global() { - static DeviceInfo x; - return x; - } - DeviceInfo() {} - - int freq_level() { return freq_level_; } - int framework_type() { return framework_type_; } - int model_type() { return model_type_; } - int device_type() { return device_type_; } - - private: - int freq_level_{3}; - int framework_type_{0}; - int model_type_{0}; - int device_type_{0}; -}; - -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name); -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl old mode 100755 new mode 100644 diff --git a/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl old mode 100755 new mode 100644 diff --git a/lite/backends/x86/jit/README.en.md b/lite/backends/x86/jit/README.en.md index 
cd2aa5c242dba1a9be669a536cd9b614bf890e48..dc9eb4cf239155ba15a855c98e5515adb717d2d5 100644 --- a/lite/backends/x86/jit/README.en.md +++ b/lite/backends/x86/jit/README.en.md @@ -89,7 +89,7 @@ All kernels are inlcuded in `lite/backends/x86/jit/kernels.h`, which is automati 3. Add reference function of `your_key`. Note: - this should be run on CPU and do not depend on any third-party. - - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. + - Add `USE_JITKERNEL_REFER_LITE(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. 4. Add unit test in `test.cc`, and verfiy at least `float` and `double`. Test more data type for some special functions if necessary, for example `int8`. 5. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `GetDefaultBestFunc` always get the best one. diff --git a/lite/backends/x86/jit/README.md b/lite/backends/x86/jit/README.md index 6998c5d867b079dfef69a71ca56e6f3fc30363d4..bc0e27234d05c82c9b0dcc431343d7db1a0f4067 100644 --- a/lite/backends/x86/jit/README.md +++ b/lite/backends/x86/jit/README.md @@ -79,7 +79,7 @@ PaddlePaddle/Paddle/paddle/fluid/ # 如何添加新的算子 1. 在`KernelType` 中添加 `your_key` 。 -2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel。 +2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER_LITE(your_key)`来使用该kernel。 3. (optional) 实现更多的算法在`more`目录下,可以依赖mkl,intrinsic或者mkldnn等第三方库。 4. (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在与refer相同的`KernelType`上。 5. 
添加新的`KernelTuple`,需要与`KernelType`一一对应,是所有类型的一个打包,包括数据类型,属性的类型,以及返回的函数类型。可以参考`SeqPoolTuple`,新加的Attr类型需要特例化`JitCodeKey`方法。 diff --git a/lite/backends/x86/jit/gen/CMakeLists.txt b/lite/backends/x86/jit/gen/CMakeLists.txt index 99244ea9bd919a018732b75d1ab811e8bf338516..62500775282d1c3d960f0fa9b00d3d4a2aef9390 100644 --- a/lite/backends/x86/jit/gen/CMakeLists.txt +++ b/lite/backends/x86/jit/gen/CMakeLists.txt @@ -4,33 +4,33 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) -function(USE_JITKERNEL_GEN TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") +function(USE_JITKERNEL_GEN_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_GEN_LITE(${TARGET});\n") endfunction() # use gen jitcode kernel by name -USE_JITKERNEL_GEN(kMatMul) -USE_JITKERNEL_GEN(kVMul) -USE_JITKERNEL_GEN(kVAdd) -USE_JITKERNEL_GEN(kVSub) -USE_JITKERNEL_GEN(kVAddRelu) -USE_JITKERNEL_GEN(kVScal) -USE_JITKERNEL_GEN(kVAddBias) -USE_JITKERNEL_GEN(kVRelu) -USE_JITKERNEL_GEN(kVSquare) -USE_JITKERNEL_GEN(kVIdentity) -USE_JITKERNEL_GEN(kVExp) -USE_JITKERNEL_GEN(kVSigmoid) -USE_JITKERNEL_GEN(kVTanh) -USE_JITKERNEL_GEN(kLSTMCtHt) -USE_JITKERNEL_GEN(kLSTMC1H1) -USE_JITKERNEL_GEN(kGRUH1) -USE_JITKERNEL_GEN(kGRUHtPart1) -USE_JITKERNEL_GEN(kGRUHtPart2) -USE_JITKERNEL_GEN(kNCHW16CMulNC) -USE_JITKERNEL_GEN(kSeqPool) -USE_JITKERNEL_GEN(kHMax) -USE_JITKERNEL_GEN(kHSum) -USE_JITKERNEL_GEN(kEmbSeqPool) -USE_JITKERNEL_GEN(kSgd) -USE_JITKERNEL_GEN(kVBroadcast) +USE_JITKERNEL_GEN_LITE(kMatMul) +USE_JITKERNEL_GEN_LITE(kVMul) +USE_JITKERNEL_GEN_LITE(kVAdd) +USE_JITKERNEL_GEN_LITE(kVSub) +USE_JITKERNEL_GEN_LITE(kVAddRelu) +USE_JITKERNEL_GEN_LITE(kVScal) +USE_JITKERNEL_GEN_LITE(kVAddBias) +USE_JITKERNEL_GEN_LITE(kVRelu) +USE_JITKERNEL_GEN_LITE(kVSquare) +USE_JITKERNEL_GEN_LITE(kVIdentity) +USE_JITKERNEL_GEN_LITE(kVExp) 
+USE_JITKERNEL_GEN_LITE(kVSigmoid) +USE_JITKERNEL_GEN_LITE(kVTanh) +USE_JITKERNEL_GEN_LITE(kLSTMCtHt) +USE_JITKERNEL_GEN_LITE(kLSTMC1H1) +USE_JITKERNEL_GEN_LITE(kGRUH1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart2) +USE_JITKERNEL_GEN_LITE(kNCHW16CMulNC) +USE_JITKERNEL_GEN_LITE(kSeqPool) +USE_JITKERNEL_GEN_LITE(kHMax) +USE_JITKERNEL_GEN_LITE(kHSum) +USE_JITKERNEL_GEN_LITE(kEmbSeqPool) +USE_JITKERNEL_GEN_LITE(kSgd) +USE_JITKERNEL_GEN_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/gen/act.cc b/lite/backends/x86/jit/gen/act.cc index f1f261c199d8d25997b1ce235aa99356834e43a8..45f4f7ddcce8e8864821712698c4496cf40b618c 100644 --- a/lite/backends/x86/jit/gen/act.cc +++ b/lite/backends/x86/jit/gen/act.cc @@ -156,9 +156,9 @@ size_t VTanhCreator::CodeSize(const int& d) const { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator); -REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator); -REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator); -REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator); -REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator); -REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator); +REGISTER_JITKERNEL_GEN_LITE(kVRelu, gen::VReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSquare, gen::VSquareCreator); +REGISTER_JITKERNEL_GEN_LITE(kVIdentity, gen::VIdentityCreator); +REGISTER_JITKERNEL_GEN_LITE(kVExp, gen::VExpCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSigmoid, gen::VSigmoidCreator); +REGISTER_JITKERNEL_GEN_LITE(kVTanh, gen::VTanhCreator); diff --git a/lite/backends/x86/jit/gen/blas.cc b/lite/backends/x86/jit/gen/blas.cc index 0bddea6ace7fd338d14da918516223bb17bafdbd..37183e66404dfae139a2bcd25c2855df119f939d 100644 --- a/lite/backends/x86/jit/gen/blas.cc +++ b/lite/backends/x86/jit/gen/blas.cc @@ -181,10 +181,10 @@ DECLARE_BLAS_CREATOR(VAddBias); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator); -REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator); 
-REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator); -REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator); -REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator); -REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator); -REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); +REGISTER_JITKERNEL_GEN_LITE(kVMul, gen::VMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAdd, gen::VAddCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSub, gen::VSubCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddRelu, gen::VAddReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVScal, gen::VScalCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddBias, gen::VAddBiasCreator); +REGISTER_JITKERNEL_GEN_LITE(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 2ff6894383f95699e4209215b0df3a84507a06b4..7e697014ed241a75693b783127633b255964f80b 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -145,4 +145,4 @@ class EmbSeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kEmbSeqPool, gen::EmbSeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/gru.cc b/lite/backends/x86/jit/gen/gru.cc index c5737faf134287697ef49b88f10c2590da4cc07d..4c2c57413e30589de96385c34e09733458f66b7b 100644 --- a/lite/backends/x86/jit/gen/gru.cc +++ b/lite/backends/x86/jit/gen/gru.cc @@ -111,6 +111,6 @@ DECLARE_GRU_CREATOR(GRUHtPart2); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUH1, gen::GRUH1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart1, gen::GRUHtPart1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart2, gen::GRUHtPart2Creator); diff --git a/lite/backends/x86/jit/gen/hopv.cc 
b/lite/backends/x86/jit/gen/hopv.cc index 4304dc48c5a084a747227bd4d4aedb1cec1775cd..0fdd63a7405647860416d43a86a7a7abe9fad760 100644 --- a/lite/backends/x86/jit/gen/hopv.cc +++ b/lite/backends/x86/jit/gen/hopv.cc @@ -99,5 +99,5 @@ DECLARE_HOP_CREATOR(HSum); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); -REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); +REGISTER_JITKERNEL_GEN_LITE(kHMax, gen::HMaxCreator); +REGISTER_JITKERNEL_GEN_LITE(kHSum, gen::HSumCreator); diff --git a/lite/backends/x86/jit/gen/lstm.cc b/lite/backends/x86/jit/gen/lstm.cc index 44e58d0b75612238115d5771082d28c30cad55a2..e4417355202c6370563eadd80e5cb3da6af8cdc6 100644 --- a/lite/backends/x86/jit/gen/lstm.cc +++ b/lite/backends/x86/jit/gen/lstm.cc @@ -138,5 +138,5 @@ DECLARE_LSTM_CREATOR(LSTMC1H1); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator); -REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMCtHt, gen::LSTMCtHtCreator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMC1H1, gen::LSTMC1H1Creator); diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index 2c75f6dd5dc4bbf12513d10ef0a4e02e709135fd..010c80fac4842e74c9b8272db472ddf6cf954771 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -130,4 +130,4 @@ class MatMulCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kMatMul, gen::MatMulCreator); diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index e0cf5e5a5a7646f09666f6ccb35b18610c845317..4c80737aac4bc9cd09f4ff222c8fad8c441887ec 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -82,4 +82,4 @@ class SeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSeqPool, 
gen::SeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kSeqPool, gen::SeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 10659f50844d73c14403f9e7a35d800364be1e7b..44e083366132c675b339b2da4bbb3b7c1c6b7569 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -127,4 +127,4 @@ class SgdCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator); +REGISTER_JITKERNEL_GEN_LITE(kSgd, gen::SgdCreator); diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index 9e02dca8c40975fb45feed1d818bbe6d3e65db19..fb1e71f7b0b1e6f68a331d264682e80fbab7c219 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -88,4 +88,4 @@ class VBroadcastCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); +REGISTER_JITKERNEL_GEN_LITE(kVBroadcast, gen::VBroadcastCreator); diff --git a/lite/backends/x86/jit/more/CMakeLists.txt b/lite/backends/x86/jit/more/CMakeLists.txt index 2ddbbcd16a3ffef560581592e3a009c61844d4d5..5641466d8a86e4be7b88d7eaf977e5a58d18f085 100644 --- a/lite/backends/x86/jit/more/CMakeLists.txt +++ b/lite/backends/x86/jit/more/CMakeLists.txt @@ -1,6 +1,6 @@ -function(USE_JITKERNEL_MORE TARGET TYPE) - file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") +function(USE_JITKERNEL_MORE_LITE TARGET TYPE) + file(APPEND ${jit_file} "USE_JITKERNEL_MORE_LITE(${TARGET} ${TYPE});\n") endfunction() # enable it latter diff --git a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt index 468937a4f6b27ae525bfd0d8e99cc891eedbc353..80dabc72fbe2db46359cd69760eb5a02cea615af 100644 --- a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt +++ b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt @@ -5,5 +5,5 @@ 
cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kCRFDecoding, intrinsic) -USE_JITKERNEL_MORE(kLayerNorm, intrinsic) +USE_JITKERNEL_MORE_LITE(kCRFDecoding, intrinsic) +USE_JITKERNEL_MORE_LITE(kLayerNorm, intrinsic) diff --git a/lite/backends/x86/jit/more/mix/CMakeLists.txt b/lite/backends/x86/jit/more/mix/CMakeLists.txt index dd039d29152961210958470a48f086a133ab640c..5e0238f26f1ebbd298dba0957bdc93e16671505f 100644 --- a/lite/backends/x86/jit/more/mix/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mix/CMakeLists.txt @@ -5,11 +5,11 @@ cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE) -USE_JITKERNEL_MORE(kVSigmoid, mix) -USE_JITKERNEL_MORE(kVTanh, mix) -USE_JITKERNEL_MORE(kLSTMCtHt, mix) -USE_JITKERNEL_MORE(kLSTMC1H1, mix) -USE_JITKERNEL_MORE(kGRUH1, mix) -USE_JITKERNEL_MORE(kGRUHtPart1, mix) -USE_JITKERNEL_MORE(kGRUHtPart2, mix) -USE_JITKERNEL_MORE(kSoftmax, mix) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mix) +USE_JITKERNEL_MORE_LITE(kVTanh, mix) +USE_JITKERNEL_MORE_LITE(kLSTMCtHt, mix) +USE_JITKERNEL_MORE_LITE(kLSTMC1H1, mix) +USE_JITKERNEL_MORE_LITE(kGRUH1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE_LITE(kSoftmax, mix) diff --git a/lite/backends/x86/jit/more/mkl/CMakeLists.txt b/lite/backends/x86/jit/more/mkl/CMakeLists.txt index 56f1a62ad4e06807dace2a81156d92f6b02a14df..3557f531a561caace51225ad23e2d547ad48d08c 100644 --- a/lite/backends/x86/jit/more/mkl/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mkl/CMakeLists.txt @@ -3,18 +3,18 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kMatMul, 
mkl) -USE_JITKERNEL_MORE(kVMul, mkl) -USE_JITKERNEL_MORE(kVAdd, mkl) -USE_JITKERNEL_MORE(kVScal, mkl) -USE_JITKERNEL_MORE(kStrideScal, mkl) -USE_JITKERNEL_MORE(kVExp, mkl) -USE_JITKERNEL_MORE(kVSquare, mkl) -USE_JITKERNEL_MORE(kVCopy, mkl) -USE_JITKERNEL_MORE(kVSigmoid, mkl) -USE_JITKERNEL_MORE(kVTanh, mkl) -USE_JITKERNEL_MORE(kSeqPool, mkl) -USE_JITKERNEL_MORE(kSoftmax, mkl) -USE_JITKERNEL_MORE(kEmbSeqPool, mkl) -USE_JITKERNEL_MORE(kSgd, mkl) -USE_JITKERNEL_MORE(kVBroadcast, mkl) +USE_JITKERNEL_MORE_LITE(kMatMul, mkl) +USE_JITKERNEL_MORE_LITE(kVMul, mkl) +USE_JITKERNEL_MORE_LITE(kVAdd, mkl) +USE_JITKERNEL_MORE_LITE(kVScal, mkl) +USE_JITKERNEL_MORE_LITE(kStrideScal, mkl) +USE_JITKERNEL_MORE_LITE(kVExp, mkl) +USE_JITKERNEL_MORE_LITE(kVSquare, mkl) +USE_JITKERNEL_MORE_LITE(kVCopy, mkl) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mkl) +USE_JITKERNEL_MORE_LITE(kVTanh, mkl) +USE_JITKERNEL_MORE_LITE(kSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSoftmax, mkl) +USE_JITKERNEL_MORE_LITE(kEmbSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSgd, mkl) +USE_JITKERNEL_MORE_LITE(kVBroadcast, mkl) diff --git a/lite/backends/x86/jit/refer/CMakeLists.txt b/lite/backends/x86/jit/refer/CMakeLists.txt index 7133f596620410d37ffe52a2ee92b7a9974bf1cc..c52b21ad7dca102d18aee25aa60079bf03ae82b9 100644 --- a/lite/backends/x86/jit/refer/CMakeLists.txt +++ b/lite/backends/x86/jit/refer/CMakeLists.txt @@ -2,39 +2,39 @@ cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) -function(USE_JITKERNEL_REFER TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") +function(USE_JITKERNEL_REFER_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_REFER_LITE(${TARGET});\n") endfunction() # use refer kernel by name -USE_JITKERNEL_REFER(kVMul) -USE_JITKERNEL_REFER(kVAdd) -USE_JITKERNEL_REFER(kVAddRelu) -USE_JITKERNEL_REFER(kVSub) -USE_JITKERNEL_REFER(kVScal) -USE_JITKERNEL_REFER(kStrideScal) -USE_JITKERNEL_REFER(kVAddBias) 
-USE_JITKERNEL_REFER(kVCopy) -USE_JITKERNEL_REFER(kVRelu) -USE_JITKERNEL_REFER(kVIdentity) -USE_JITKERNEL_REFER(kVExp) -USE_JITKERNEL_REFER(kVSigmoid) -USE_JITKERNEL_REFER(kVTanh) -USE_JITKERNEL_REFER(kLSTMCtHt) -USE_JITKERNEL_REFER(kLSTMC1H1) -USE_JITKERNEL_REFER(kGRUH1) -USE_JITKERNEL_REFER(kGRUHtPart1) -USE_JITKERNEL_REFER(kGRUHtPart2) -USE_JITKERNEL_REFER(kCRFDecoding) -USE_JITKERNEL_REFER(kLayerNorm) -USE_JITKERNEL_REFER(kNCHW16CMulNC) -USE_JITKERNEL_REFER(kSeqPool) -USE_JITKERNEL_REFER(kMatMul) -USE_JITKERNEL_REFER(kVSquare) -USE_JITKERNEL_REFER(kHSum) -USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideASum) -USE_JITKERNEL_REFER(kSoftmax) -USE_JITKERNEL_REFER(kEmbSeqPool) -USE_JITKERNEL_REFER(kSgd) -USE_JITKERNEL_REFER(kVBroadcast) +USE_JITKERNEL_REFER_LITE(kVMul) +USE_JITKERNEL_REFER_LITE(kVAdd) +USE_JITKERNEL_REFER_LITE(kVAddRelu) +USE_JITKERNEL_REFER_LITE(kVSub) +USE_JITKERNEL_REFER_LITE(kVScal) +USE_JITKERNEL_REFER_LITE(kStrideScal) +USE_JITKERNEL_REFER_LITE(kVAddBias) +USE_JITKERNEL_REFER_LITE(kVCopy) +USE_JITKERNEL_REFER_LITE(kVRelu) +USE_JITKERNEL_REFER_LITE(kVIdentity) +USE_JITKERNEL_REFER_LITE(kVExp) +USE_JITKERNEL_REFER_LITE(kVSigmoid) +USE_JITKERNEL_REFER_LITE(kVTanh) +USE_JITKERNEL_REFER_LITE(kLSTMCtHt) +USE_JITKERNEL_REFER_LITE(kLSTMC1H1) +USE_JITKERNEL_REFER_LITE(kGRUH1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart2) +USE_JITKERNEL_REFER_LITE(kCRFDecoding) +USE_JITKERNEL_REFER_LITE(kLayerNorm) +USE_JITKERNEL_REFER_LITE(kNCHW16CMulNC) +USE_JITKERNEL_REFER_LITE(kSeqPool) +USE_JITKERNEL_REFER_LITE(kMatMul) +USE_JITKERNEL_REFER_LITE(kVSquare) +USE_JITKERNEL_REFER_LITE(kHSum) +USE_JITKERNEL_REFER_LITE(kHMax) +USE_JITKERNEL_REFER_LITE(kStrideASum) +USE_JITKERNEL_REFER_LITE(kSoftmax) +USE_JITKERNEL_REFER_LITE(kEmbSeqPool) +USE_JITKERNEL_REFER_LITE(kSgd) +USE_JITKERNEL_REFER_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/refer/refer.cc b/lite/backends/x86/jit/refer/refer.cc index 
e1b1240c5d5b0bc382fae8bd1b77f6c412522bdd..c47f8216abd999e66e914b208d96b8f352226f71 100644 --- a/lite/backends/x86/jit/refer/refer.cc +++ b/lite/backends/x86/jit/refer/refer.cc @@ -18,7 +18,7 @@ namespace refer = paddle::lite::jit::refer; #define REGISTER_REFER_KERNEL(func) \ - REGISTER_JITKERNEL_REFER( \ + REGISTER_JITKERNEL_REFER_LITE( \ k##func, refer::func##Kernel, refer::func##Kernel) REGISTER_REFER_KERNEL(VMul); diff --git a/lite/backends/x86/jit/registry.h b/lite/backends/x86/jit/registry.h index 7613a8dd4376045beb3636954668130e7220521e..65e3152d70fdd6262583cddced78e43513f0e0a1 100644 --- a/lite/backends/x86/jit/registry.h +++ b/lite/backends/x86/jit/registry.h @@ -77,16 +77,16 @@ class JitKernelRegistrar { void Touch() {} }; -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) // Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ +#define REGISTER_JITKERNEL_REFER_LITE(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace, \ "REGISTER_KERNEL_REFER must be called in global namespace"); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::ReferKernelPool, \ @@ -94,84 +94,84 @@ class JitKernelRegistrar { __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ return 0; \ } // kernel_type: should be in paddle::lite::jit::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ +#define REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, place_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ - UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + UNUSED = LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::KernelPool, \ ::paddle::lite::fluid::place_type, \ __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ + int LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ .Touch(); \ return 0; \ } #define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) - -#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) - -#define REGISTER_JITKERNEL_GEN(kernel_type, ...) 
\ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::JitCodeCreatorPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ - return 0; \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) + +#define REGISTER_GPUKERNEL_MORE_LITE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) + +#define REGISTER_JITKERNEL_GEN_LITE(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "REGISTER_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::lite::jit::JitKernelRegistrar< \ + ::paddle::lite::jit::JitCodeCreatorPool, \ + ::paddle::lite::fluid::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ + ::paddle::lite::jit::KernelType::kernel_type); \ + int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ + return 0; \ } -#define USE_JITKERNEL_GEN(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "USE_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ - static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ - TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() - -#define USE_JITKERNEL_REFER(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ - "USE_JITKERNEL_REFER must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() - -#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ - "USE_JITKERNEL_MORE must be called in global namespace"); \ - extern int \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ - static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ - UNUSED = \ - 
TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() - -#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ - USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) +#define USE_JITKERNEL_GEN_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "USE_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ + static int use_litejitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() + +#define USE_JITKERNEL_REFER_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace_, \ + "USE_JITKERNEL_REFER_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int use_litejitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() + +#define USE_KERNEL_MORE_LITE(kernel_type, impl_type, place_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type##_, \ + "USE_JITKERNEL_MORE_LITE must be called in global namespace"); \ + extern int \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ + static int use_litejitkernel_##kernel_type##_##impl_type##_##place_type##_ \ + UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() + +#define USE_JITKERNEL_MORE_LITE(kernel_type, impl_type) \ + USE_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace) } // namespace jit } // namespace lite diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h old mode 100755 new mode 100644 diff --git a/lite/backends/xpu/builder.cc b/lite/backends/xpu/builder.cc deleted file mode 100644 index 796eaf9c46ceb3d29f1ffdc4c86ac45509f07ba1..0000000000000000000000000000000000000000 --- 
a/lite/backends/xpu/builder.cc +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/xpu/builder.h" -#include // NOLINT -#include -#include "lite/backends/xpu/runtime.h" - -namespace paddle { -namespace lite { -namespace xpu { - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname) { - auto iarg_names = op_info->input_argnames(); - if (std::find(iarg_names.begin(), iarg_names.end(), argname) != - iarg_names.end()) { - auto inputs = op_info->Input(argname); - if (inputs.empty()) { - return false; - } - auto var_name = inputs.front(); - auto var = scope->FindVar(var_name); - return var != nullptr; - } else { - return false; - } -} - -std::string UniqueName(const std::string& prefix) { - static std::mutex counter_mtx; - static std::unordered_map counter_map; - std::unique_lock counter_lck(counter_mtx); - int counter = 1; - auto it = counter_map.find(prefix); - if (it == counter_map.end()) { - counter_map[prefix] = counter; - } else { - counter = ++(it->second); - } - return prefix + "_" + std::to_string(counter); -} - -xtcl::DataType CvtPrecisionType(PrecisionType in_type) { - xtcl::DataType out_type = ::xtcl::Float(32); - switch (in_type) { - case PRECISION(kFloat): - out_type = ::xtcl::Float(32); - break; - case PRECISION(kInt8): - out_type = ::xtcl::Int(8); - break; - case PRECISION(kInt32): - out_type 
= ::xtcl::Int(32); - break; - default: - LOG(FATAL) << "Can not convert precision type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -DLDataType CvtDataType(PrecisionType in_type) { - DLDataType out_type = {kDLFloat, 32, 1}; - switch (in_type) { - case PRECISION(kFloat): - out_type = {kDLFloat, 32, 1}; - break; - case PRECISION(kInt8): - out_type = {kDLInt, 8, 1}; - break; - case PRECISION(kInt32): - out_type = {kDLInt, 32, 1}; - break; - default: - LOG(FATAL) << "Can not convert data type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - xtcl::Array out_shape; - for (auto dim : in_shape) { - out_shape.push_back(dim); - } - return out_shape; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - return CvtShape(std::vector(in_shape.begin(), in_shape.end())); -} - -xtcl::Array CvtShape(const DDim& in_dims) { - return CvtShape(in_dims.Vectorize()); -} - -std::shared_ptr CvtTensor(lite::Tensor* in_tensor, - std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - uint8_t* in_data = nullptr; - auto in_size = in_tensor->dims().production(); - auto in_shape = in_tensor->dims().Vectorize(); - if (out_shape.empty()) { - out_shape = in_shape; - } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "Unknow precision type " << PrecisionToStr(in_ptype); - } - auto out_tensor = std::make_shared( - xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0})); - auto out_data = - 
reinterpret_cast(out_tensor->ToDLPack()->dl_tensor.data); - std::memcpy(out_data, in_data, in_bytes); - return out_tensor; -} - -// Build the XPU subgraph to the XPU model, store the model data into the -// weight tensor of the graph op, and the model data will be loaded again -// by the graph computing kernel when the graph op is executed for inference. -// Due to the lack of XPU APIs for building and outputing the model data, -// the compiled XPU runtime object will be managed by the global variable -// 'DeviceInfo' and the key name for finding the runtime object will be -// stored in the weight tensor of graph op. -// TODO(hong19860320) Compile the XPU subgraph and output the compiled model -// data to the weight tensor of graph op. -bool BuildModel( - std::shared_ptr builder, - std::shared_ptr params, - std::vector>* outputs, - lite::Tensor* model) { - LOG(INFO) << "[XPU] Build Model."; - CHECK(builder != nullptr); - CHECK(outputs != nullptr); - CHECK_GT(outputs->size(), 0); - CHECK(model != nullptr); - - // build graph and fill all of constant params - xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); - auto target = xtcl::Target::Create("llvm"); - auto compiler = xtcl::network::xTensorCompiler(network, target); - compiler.SetParams(*params); // set the data of constant tensors - compiler.Build(); - - // create and register runtime - auto runtime = std::make_shared( - compiler.CreateRuntimeInstance()); - if (runtime == nullptr) { - LOG(WARNING) << "[XPU] Build Model failed!"; - return false; - } - std::string name = UniqueName("xpu"); - LOG(INFO) << "[XPU] Model Name: " << name; - DeviceInfo::Global().Insert(name, runtime); - model->Resize({static_cast(name.length() + 1)}); - memcpy(model->mutable_data(), - reinterpret_cast(name.c_str()), - name.length() + 1); - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/builder.h b/lite/backends/xpu/builder.h deleted file mode 100644 
index f0ac2b303aac7fa7f827e6e2f8f0fdf614b604b5..0000000000000000000000000000000000000000 --- a/lite/backends/xpu/builder.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace xpu { - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -std::string UniqueName(const std::string& prefix); - -xtcl::DataType CvtPrecisionType(PrecisionType in_type); - -DLDataType CvtDataType(PrecisionType in_type); - -xtcl::Array CvtShape(const std::vector& in_shape); - -xtcl::Array CvtShape(const std::vector& in_shape); - -xtcl::Array CvtShape(const DDim& in_dims); - -std::shared_ptr CvtTensor( - Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -bool BuildModel( - std::shared_ptr builder, - std::shared_ptr params, - std::vector>* outputs, - lite::Tensor* model); - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/device.cc b/lite/backends/xpu/device.cc old mode 100755 new mode 100644 index dbf88ff83302c38cdc1b266f6e5c829c5a5c1da1..badde878ad870bfc5fcd1984e39923174a11e9e2 --- 
a/lite/backends/xpu/device.cc +++ b/lite/backends/xpu/device.cc @@ -36,8 +36,11 @@ std::unique_ptr Device::Build( } xtcl::xNetwork network = builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs)); - auto target = xtcl::Target::Create(device_name_); - auto compiler = xtcl::network::xTensorCompiler(network, target); + auto target = xtcl::NullValue(); + if (!target_.empty()) { + target = xtcl::Target::Create(target_); + } + xtcl::network::xTensorCompiler compiler(network, target); compiler.SetParams(*params); // Set the data of constant tensors compiler.Build(); VLOG(3) << "[XPU] Build done"; diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h old mode 100755 new mode 100644 index bf9a8bf76af168a8a73f8f497b793df88f48f96b..6de18d5466da6e6b791363d2e275ea72376c78b8 --- a/lite/backends/xpu/device.h +++ b/lite/backends/xpu/device.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -30,7 +31,18 @@ class Device { static Device x; return x; } - Device() {} + Device() { + char* name = std::getenv("XPU_DEVICE_NAME"); + if (name) { + name_ = std::string(name); + } + // XPU_DEVICE_TARGET for XPU model building, which supports 'llvm' and 'xpu + // -libs=xdnn' + char* target = std::getenv("XPU_DEVICE_TARGET"); + if (target) { + target_ = std::string(target); + } + } // Build the XPU graph to the XPU runtime, return the XPU runtime which can be // used to run inference. 
@@ -39,10 +51,12 @@ class Device { xtcl::network::xTensorCompiler::ParamNDArrayMap* params, std::vector* outputs); + const std::string name() const { return name_; } + const std::string target() const { return target_; } + private: - // Keep reserved fields - int device_id_{0}; - std::string device_name_{"llvm"}; + std::string name_{""}; + std::string target_{""}; }; } // namespace xpu diff --git a/lite/backends/xpu/runtime.cc b/lite/backends/xpu/runtime.cc deleted file mode 100644 index a2c34b95758e8abf81c8294507d0ca60aad7c021..0000000000000000000000000000000000000000 --- a/lite/backends/xpu/runtime.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/xpu/runtime.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace xpu { - -// Extract the model data and recover the XPU model for inference, the function -// is called by the graph computing kernel when the graph op is executed. -// Due to the lack of XPU APIs for loading and recovering the XPU model from -// memory, the key name is obtained from the weight tensor of graph op, to get -// the runtime object for inference from the global variable 'DeviceInfo'. -// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op. 
-bool LoadModel(const lite::Tensor &model, - std::shared_ptr *runtime) { - LOG(INFO) << "[XPU] Load Model."; - CHECK_GT(model.dims().production(), 0); - std::string name(reinterpret_cast(model.data())); - LOG(INFO) << "[XPU] Model Name: " << name; - CHECK(runtime != nullptr); - *runtime = DeviceInfo::Global().Find(name); - if (*runtime == nullptr) { - LOG(WARNING) << "[XPU] Load Model failed!"; - return false; - } - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/runtime.h b/lite/backends/xpu/runtime.h deleted file mode 100644 index 4ff8d75bce6156d51a4988d427058da34460443f..0000000000000000000000000000000000000000 --- a/lite/backends/xpu/runtime.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace xpu { - -class DeviceInfo { - public: - static DeviceInfo& Global() { - static DeviceInfo x; - return x; - } - DeviceInfo() {} - - void Insert(const std::string& name, - std::shared_ptr runtime) { - if (runtimes_.find(name) != runtimes_.end()) { - LOG(WARNING) << "[XPU] Model " << name << " already exists."; - return; - } - runtimes_.emplace(std::make_pair(name, runtime)); - } - - void Clear() { runtimes_.clear(); } - - std::shared_ptr Find( - const std::string& name) const { - if (runtimes_.find(name) != runtimes_.end()) { - return runtimes_.at(name); - } else { - return nullptr; - } - } - - private: - int device_id_{0}; - std::string device_name_{"default"}; - std::unordered_map> - runtimes_; -}; - -bool LoadModel(const lite::Tensor& model, - std::shared_ptr* runtime); - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 57f353c0ee5432bddec8cddc5a639c2f72ecf172..1d0558451fce67433d966d1f4bff82af26459e33 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -96,7 +96,15 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) - +# create headfile to restore ops info sorted by suppported platforms +add_custom_command( + COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py + ${kernels_src_list} + ${ops_src_list} + ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h + OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. 
+ ) + add_custom_target(supported_kernel_op_info_h DEPENDS supported_kernel_op_info.h) #----------------------------------------------- NOT CHANGE ----------------------------------------------- lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index d379b31b84f09f1e99742be52d58c3f0b1ee10f3..1c85353d5386fea1ae7f4a0f1869a95f8a2478af 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/framework.proto b/lite/core/framework.proto index 5adf2a18b98c2a2d3e2f6e8f7dd5688150674dc6..84b5502ff7b369452e7c9988d185450934c78b03 100644 --- a/lite/core/framework.proto +++ b/lite/core/framework.proto @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 86193235a2984b15a33c2eeaff15865d9f126eeb..18a1243c11652afc181f13f0f5a497858a30885f 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -83,14 +83,11 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif - #ifdef LITE_WITH_PROFILE - CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " - "When LITE_WITH_PROFILE is defined, please set a " - "Profiler for Instruction."; - profiler_->StartTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); + profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); Run(); - profiler_->StopTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); #else Run(); #endif diff --git a/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc b/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index 97f6a2657f0f7ed8963529cdbec5aad00e763807..8447865bdc85f4e007d94d34be724cbe8329903b 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,5 +35,7 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kBM)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kX86)}) .BindKernel("fusion_elementwise_add_activation"); diff --git 
a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index 5b8e8563ba2e44c1c855cd3d4c6a9a08c06c826f..c85d34cbaecc63d3f6bb12a654e2ba0ea2a3232b 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -23,8 +23,13 @@ namespace lite { namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { - fusion::FcFuser fuser; +#ifdef LITE_WITH_X86 + fusion::FcFuser fuser(true); fuser(graph.get()); +#endif + + fusion::FcFuser fuser2(false); + fuser2(graph.get()); } } // namespace mir @@ -33,5 +38,7 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kBM)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuse_pass_test.cc b/lite/core/mir/fusion/fc_fuse_pass_test.cc index f7aa4bb5adcb848531ecc3a8f63bace1c2e3e0ff..54260732c5efe788f0d3740197253fa2321a7d02 100644 --- a/lite/core/mir/fusion/fc_fuse_pass_test.cc +++ b/lite/core/mir/fusion/fc_fuse_pass_test.cc @@ -88,6 +88,7 @@ USE_LITE_OP(mul); USE_LITE_OP(elementwise_add); USE_LITE_OP(elementwise_sub); USE_LITE_OP(fc); +USE_LITE_OP(relu); USE_LITE_OP(feed); USE_LITE_OP(fetch); USE_LITE_OP(io_copy); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 460c0fdf7a4309638b9852a315ca0efda02801ab..3c99131083d37ea2c8511ed136bff17c891529af 100644 --- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -35,12 +35,23 @@ void FcFuser::BuildPattern() { std::vector mul_inputs{W, x}; std::vector add_inputs{mul_out, b}; mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; // Some op specialities. 
mul_out->AsIntermediate(); mul->AsIntermediate(); add->AsIntermediate(); + + if (with_relu_) { + auto* add_out = VarNode("add_out"); + auto* relu = OpNode("relu", "relu"); + std::vector relu_inputs{add_out}; + add_inputs >> *add >> *add_out; + relu_inputs >> *relu >> *Out; + add_out->AsIntermediate(); + relu->AsIntermediate(); + } else { + add_inputs >> *add >> *Out; + } } void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { @@ -71,6 +82,9 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { op_desc.SetAttr( "in_num_col_dims", matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + if (with_relu_) { + op_desc.SetAttr("activation_type", std::string{"relu"}); + } return op_desc; } diff --git a/lite/core/mir/fusion/fc_fuser.h b/lite/core/mir/fusion/fc_fuser.h index 7ba07527898c7e648c5f7f9151642ab0928fa496..6cb08f41574b67df1c78fa296d2d395771a66ee1 100644 --- a/lite/core/mir/fusion/fc_fuser.h +++ b/lite/core/mir/fusion/fc_fuser.h @@ -25,11 +25,13 @@ namespace fusion { class FcFuser : public FuseBase { public: + explicit FcFuser(bool with_relu) : with_relu_(with_relu) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + bool with_relu_; }; } // namespace fusion diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.cc b/lite/core/mir/fusion/sequence_pool_concat_fuser.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.h b/lite/core/mir/fusion/sequence_pool_concat_fuser.h old mode 100755 new mode 100644 diff --git 
a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.h b/lite/core/mir/fusion/var_conv_2d_activation_fuser.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 9ad69b8152273628d70c796228f978e9f990ed9e..76c97d2da6ed9e7c6fc1f1889d80095278b68ec0 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -29,7 +29,6 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; - LOG(INFO) << stmt; insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt index 1ac4ab346f15edf9e039d3143c0a301d49a1c0b4..f8aa09676c2d1e6d4df6fafbaf6a54bc69491acc 100644 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ b/lite/core/mir/subgraph/CMakeLists.txt @@ -4,7 +4,7 @@ lite_cc_library(subgraph_detector lite_cc_library(subgraph_pass SRCS subgraph_pass.cc DEPS mir_pass types context ${mir_fusers} subgraph_detector) -if (WITH_TESTING) +if (WITH_TESTING AND NOT LITE_WITH_CUDA) lite_cc_test(test_subgraph_detector SRCS subgraph_detector_test.cc DEPS subgraph_detector mir_passes gflags model_parser cxx_api diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc old mode 100755 new mode 100644 index bf04d5c2ef2a7c000849b883bb6b09c400d28399..6d48b053a1a4140252d35e85d2351644d3c216e9 --- 
a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -94,7 +94,7 @@ std::string SubgraphVisualizer::operator()() { } auto res = dot.Build(); - //std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; + std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; return res; } diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc old mode 100755 new mode 100644 index af5bcdee08273d84c21b68deffec8ffad765af66..116b3616814641dcd68ca56026cde10e8e1058d1 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -27,7 +27,7 @@ namespace mir { void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/npu/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { @@ -41,7 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { @@ -55,7 +55,7 @@ void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { void BMSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define 
USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/bm/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc old mode 100755 new mode 100644 index 0d5fc7bf5e21e1d44cb62a507d17a3c7027573c2..a56c364f975fa6c3f82e1bbbb4489c93eb6ab724 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -92,7 +92,7 @@ void FillInputTensors( #define FILL_TENSOR_WITH_TYPE(type) \ auto input_tensor_data = input_tensor->mutable_data(); \ for (int j = 0; j < input_tensor_size; j++) { \ - input_tensor_data[i] = static_cast(value); \ + input_tensor_data[j] = static_cast(value); \ } for (int i = 0; i < input_tensor_shape.size(); i++) { auto input_tensor = predictor->GetInput(i); diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index 78317f78ac6bf7024c1984c2127434d55b738ad6..f4d0e3c0afbe1f9df4e381a502e1800a3d58ba68 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -28,36 +28,55 @@ auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { }; } -int Profiler::NewTimer(const OpCharacter& ch) { - StatisUnit unit; - unit.character = ch; +std::map TypeStr{ + {Type::kUnk, "Unknown"}, + {Type::kCreate, "Create"}, + {Type::kDispatch, "Dispatch"}, +}; + +StatisUnit::StatisUnit(const OpCharacter& ch) : character(ch) { + create_t.reset(new DeviceTimer()); if (ch.target == TargetType::kCUDA) { #ifdef LITE_WITH_CUDA - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); #else LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " "default x86 timer is used instead."; 
#endif } else { - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); } +} + +lite::profile::Timer* StatisUnit::Timer(Type type) { + if (type == Type::kCreate) { + return create_t.get(); + } else if (type == Type::kDispatch) { + return dispatch_t.get(); + } + LOG(FATAL) << "Timer cannot be returned for unknown platforms."; + return nullptr; +} + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit(ch); units_.push_back(std::move(unit)); return units_.size() - 1; } -void Profiler::StartTiming(const int index, KernelContext* ctx) { +void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - units_[index].timer->Start(ctx); + units_[index].Timer(type)->Start(ctx); } -float Profiler::StopTiming(const int index, KernelContext* ctx) { +float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - return units_[index].timer->Stop(ctx); + return units_[index].Timer(type)->Stop(ctx); } -std::string Profiler::Summary(bool concise, size_t w) { +std::string Profiler::Summary(Type type, bool concise, size_t w) { using std::setw; using std::left; using std::fixed; @@ -65,12 +84,14 @@ std::string Profiler::Summary(bool concise, size_t w) { std::string title; // Title. 
if (concise) { - ss << "Timing cycle = " << units_.front().timer->LapTimes().Size() + ss << "Timing cycle = " << units_.front().Timer(type)->LapTimes().Size() << std::endl; - ss << "===== Concise Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Concise " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } else { - ss << "===== Detailed Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Detailed " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } ss << setw(25) << left << "Operator Type" @@ -84,16 +105,16 @@ std::string Profiler::Summary(bool concise, size_t w) { if (concise) { std::map summary(op_comp); for (auto& unit : units_) { - auto ch = summary.find(unit.character); + auto ch = summary.find(unit.Character()); if (ch != summary.end()) { - ch->second.avg += unit.timer->LapTimes().Avg(w); - ch->second.min += unit.timer->LapTimes().Min(w); - ch->second.max += unit.timer->LapTimes().Max(w); + ch->second.avg += unit.Timer(type)->LapTimes().Avg(w); + ch->second.min += unit.Timer(type)->LapTimes().Min(w); + ch->second.max += unit.Timer(type)->LapTimes().Max(w); } else { - TimeInfo info({unit.timer->LapTimes().Avg(w), - unit.timer->LapTimes().Min(w), - unit.timer->LapTimes().Max(w)}); - summary.insert({unit.character, info}); + TimeInfo info({unit.Timer(type)->LapTimes().Avg(w), + unit.Timer(type)->LapTimes().Min(w), + unit.Timer(type)->LapTimes().Max(w)}); + summary.insert({unit.Character(), info}); } } for (const auto& item : summary) { @@ -109,14 +130,15 @@ std::string Profiler::Summary(bool concise, size_t w) { } } else { for (auto& unit : units_) { + const auto& times = unit.Timer(type)->LapTimes(); // clang-format off - ss << setw(25) << left << fixed << unit.character.op_type \ - << " " << setw(40) << left << fixed << unit.character.kernel_name \ - << " " << setw(12) << left << fixed << 
unit.character.remark \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Avg(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Min(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Max(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Last(w) \ + ss << setw(25) << left << fixed << unit.Character().op_type \ + << " " << setw(40) << left << fixed << unit.Character().kernel_name \ + << " " << setw(12) << left << fixed << unit.Character().remark \ + << " " << setw(12) << left << fixed << times.Avg(w) \ + << " " << setw(12) << left << fixed << times.Min(w) \ + << " " << setw(12) << left << fixed << times.Max(w) \ + << " " << setw(12) << left << fixed << times.Last(w) \ << std::endl; // clang-format on } diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h index 4e9e9ae31c1a6d7f331eac2e77c4971986bd42a1..3933e5ba01ebcb20420494a955cbc0e202879f76 100644 --- a/lite/core/profile/profiler.h +++ b/lite/core/profile/profiler.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include @@ -22,6 +23,14 @@ namespace paddle { namespace lite { namespace profile { +enum class Type { + kUnk = 0, + kCreate, + kDispatch, +}; + +extern std::map TypeStr; + struct TimeInfo { float avg; float min; @@ -35,8 +44,15 @@ struct OpCharacter { std::string remark{std::string("N/A")}; }; -struct StatisUnit { - std::unique_ptr timer; +class StatisUnit final { + public: + explicit StatisUnit(const OpCharacter& ch); + lite::profile::Timer* Timer(Type type); + const OpCharacter& Character() const { return character; } + + protected: + std::unique_ptr create_t; + std::unique_ptr dispatch_t; OpCharacter character; }; @@ -45,9 +61,9 @@ class Profiler final { Profiler() = default; explicit Profiler(const std::string& name) : name_(name) {} int NewTimer(const OpCharacter& ch); - void StartTiming(const int index, KernelContext* ctx); - float StopTiming(const int index, KernelContext* ctx); - std::string Summary(bool concise = true, size_t warm_up = 10); + void StartTiming(Type type, const int index, KernelContext* ctx); + float StopTiming(Type type, const int index, KernelContext* ctx); + std::string Summary(Type type, bool concise = true, size_t warm_up = 10); private: std::string name_{std::string("N/A")}; diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc index 6f49698ef4a8f83e4192a16801566fdcbd7baf9a..3841f0151890d377a87f4f5d4b6d069ee75b560e 100644 --- a/lite/core/profile/test_timer.cc +++ b/lite/core/profile/test_timer.cc @@ -69,10 +69,10 @@ TEST(profiler, real_latency) { ch.op_type = "operator/1"; ch.kernel_name = "kernel/1"; int idx = profiler.NewTimer(ch); - profiler.StartTiming(idx, &ctx); + profiler.StartTiming(Type::kDispatch, idx, &ctx); std::this_thread::sleep_for(std::chrono::milliseconds(10)); - profiler.StopTiming(idx, &ctx); - std::cout << profiler.Summary(); + profiler.StopTiming(Type::kDispatch, idx, &ctx); + std::cout << profiler.Summary(Type::kDispatch); } #endif diff --git 
a/lite/core/program.cc b/lite/core/program.cc index 8dc8fb0dddc54d7d83b2368b31b5f30725469296..41d178f015d723aff739e608501e4619f8b10f5d 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -137,8 +137,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { void RuntimeProgram::Run() { for (auto& inst : instructions_) { - std::string op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; + if (inst.is_feed_fetch_op()) continue; inst.Run(); #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE @@ -147,7 +146,7 @@ void RuntimeProgram::Run() { #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(false, 0); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); #endif // LITE_WITH_PROFILE } @@ -252,8 +251,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { } void Instruction::Run() { +#ifdef LITE_WITH_PROFILE + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. 
" + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming( + profile::Type::kCreate, profile_id_, kernel_->mutable_context()); +#endif CHECK(op_) << "op null"; CHECK(kernel_) << "kernel null"; + if (first_epoch_) { first_epoch_ = false; CHECK(op_->CheckShape()); @@ -263,10 +270,7 @@ void Instruction::Run() { return; } - // VLOG(4) << "kernel launch"; op_->InferShape(); - // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target " - // << TargetToStr(kernel_->target()); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program.h b/lite/core/program.h index 291252619b396f18576b935a0189f4ecdba7867f..c845a17c52c0c565e339a13e093f3e8f59e8d4a7 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -90,7 +90,12 @@ struct Program { struct Instruction { Instruction(const std::shared_ptr& op, std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) {} + : op_(op), kernel_(std::move(kernel)) { + std::string op_type = op->Type(); + if (op_type == "feed" || op_type == "fetch") { + is_feed_fetch_op_ = true; + } + } // Run the instruction. 
void Run(); @@ -101,6 +106,8 @@ struct Instruction { const KernelBase* kernel() const { return kernel_.get(); } KernelBase* mutable_kernel() { return kernel_.get(); } + bool is_feed_fetch_op() const { return is_feed_fetch_op_; } + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; @@ -118,6 +125,7 @@ struct Instruction { private: std::shared_ptr op_; std::unique_ptr kernel_; + bool is_feed_fetch_op_{false}; bool first_epoch_{true}; bool has_run_{false}; @@ -143,7 +151,8 @@ class LITE_API RuntimeProgram { } ~RuntimeProgram() { #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); #endif // LITE_WITH_PROFILE } diff --git a/lite/core/tensor.h b/lite/core/tensor.h index de08aa82f327ebfb9c84b121f6d411dbbab24ff6..41a2d16f75f946c9ef8250d3e2af1ac6ee370d60 100644 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -139,6 +139,22 @@ class TensorLite { // For other devices, T and R may be the same type. 
template R *mutable_data() { + auto type_id = typeid(T).hash_code(); + if (type_id == typeid(bool).hash_code()) { // NOLINT + precision_ = PrecisionType::kBool; + } else if (type_id == typeid(float).hash_code()) { // NOLINT + precision_ = PrecisionType::kFloat; + } else if (type_id == typeid(int8_t).hash_code()) { + precision_ = PrecisionType::kInt8; + } else if (type_id == typeid(int16_t).hash_code()) { + precision_ = PrecisionType::kInt16; + } else if (type_id == typeid(int32_t).hash_code()) { + precision_ = PrecisionType::kInt32; + } else if (type_id == typeid(int64_t).hash_code()) { + precision_ = PrecisionType::kInt64; + } else { + precision_ = PrecisionType::kUnk; + } memory_size_ = dims_.production() * sizeof(T); buffer_->ResetLazy(target_, memory_size_); return reinterpret_cast(static_cast(buffer_->data()) + @@ -163,10 +179,7 @@ class TensorLite { template R *mutable_data(TargetType target) { target_ = target; - memory_size_ = dims_.production() * sizeof(T); - buffer_->ResetLazy(target, memory_size()); - return reinterpret_cast(static_cast(buffer_->data()) + - offset_); + return mutable_data(); } void *mutable_data(size_t memory_size); void *mutable_data(TargetType target, size_t memory_size); diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index 5e0ec49adda2c6f7372bdbba1fdd04b610b0a0bc..3217a7ed49006325715e22f8aa82d155bc8bf927 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -1,91 +1,111 @@ # C++ Demo -1. 使用`lite/tools/Dockerfile.mobile`生成docker镜像 -2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 -3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz ` -4. 
执行以下命令准备模拟器环境 -```shell -# armv8 -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` -```shell -# armv7 -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` -5. 准备模型、编译并运行完整api的demo +1. 环境准备 + - 保证Android NDK在/opt目录下 + - 一台armv7或armv8架构的安卓手机 +2. 编译并运行全量api的demo(注:当编译模式为tiny_pubish时将不存在该demo) ```shell cd inference_lite_lib.android.armv8/demo/cxx/mobile_full wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz make -adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/ -adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +adb push mobilenet_v1 /data/local/tmp/ +adb push mobilenetv1_full_api /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_full_api +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" ``` 运行成功将在控制台输出预测结果的前10个类别的预测概率 -6. 编译并运行轻量级api的demo +3. 
编译并运行轻量级api的demo ```shell cd ../mobile_light make -adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +adb push mobilenetv1_light_api /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_light_api +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt" ``` +运行成功将在控制台输出预测结果的前10个类别的预测概率 -7. 编译并运行目标检测的demo +4. 编译并运行ssd目标检测的demo ```shell -cd ../mobile_detection +cd ../ssd_detection wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz tar zxvf mobilenetv1-ssd.tar.gz make -adb -s emulator-5554 push mobile_detection /data/local/tmp/ -adb -s emulator-5554 push test.jpg /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && -/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" -adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./ +adb push ssd_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push mobilenetv1-ssd /data/local/tmp +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/ssd_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/ssd_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_ssd_detection_result.jpg ./ ``` -运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg 
+运行成功将在ssd_detection目录下看到生成的目标检测结果图像: test_ssd_detection_result.jpg -8. 编译并运行物体分类的demo +5. 编译并运行yolov3目标检测的demo +```shell +cd ../yolov3_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz +tar zxvf mobilenetv1-yolov3.tar.gz +make +adb push yolov3_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push mobilenetv1-yolov3 /data/local/tmp +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/yolov3_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/yolov3_detection /data/local/tmp/mobilenetv1-yolov3 /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./ +``` +运行成功将在yolov3_detection目录下看到生成的目标检测结果图像: test_yolov3_detection_result.jpg + +6. 编译并运行物体分类的demo ```shell cd ../mobile_classify wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz +./model_optimize_tool optimize model make + adb -s emulator-5554 push mobile_classify /data/local/tmp/ adb -s emulator-5554 push test.jpg /data/local/tmp/ adb -s emulator-5554 push labels.txt /data/local/tmp/ adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && -/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" ``` 运行成功将在控制台输出预测结果的前5个类别的预测概率 - 如若想看前10个类别的预测概率,在运行命令输入topk的值即可 eg: ```shell adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" + /data/local/tmp/mobile_classify 
/data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" ``` - 如若想看其他模型的分类结果, 在运行命令输入model_dir 及其model的输入大小即可 eg: ```shell adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" + /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" ``` +9. 编译含CV预处理库模型单测demo +```shell +cd ../test_cv +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +./model_optimize_tool optimize model +make +adb -s emulator-5554 push test_model_cv /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push labels.txt /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/test_model_cv +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +``` +运行成功将在控制台输出预测结果的前10个类别的预测概率 diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 old mode 100755 new mode 100644 diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 old mode 100755 new mode 100644 diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 similarity index 90% rename from lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 rename to lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 index 784ad73da4bf1d37ee23c17ac7c4dfc5c08f2627..05f1c2e276b9cc41cfd4e3f9b4c82790d844ba52 
100644 --- a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 @@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY #CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) -mobile_detection: fetch_opencv mobile_detection.o - $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) -mobile_detection.o: mobile_detection.cc - $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc fetch_opencv: @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} @@ -57,5 +57,5 @@ fetch_opencv: .PHONY: clean clean: - rm -f mobile_detection.o - rm -f mobile_detection + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 similarity index 89% rename from lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 rename to lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 index 2304b38efffdd96e7e13073020df4954b5e53034..77ff07df9541c554ac5fabf3cf56ee4a8904ea9c 100644 --- a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 @@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY #CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) -mobile_detection: fetch_opencv mobile_detection.o - $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection 
$(CXX_LIBS) $(LDFLAGS) +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) -mobile_detection.o: mobile_detection.cc - $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc fetch_opencv: @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} @@ -57,5 +57,5 @@ fetch_opencv: .PHONY: clean clean: - rm -f mobile_detection.o - rm -f mobile_detection + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..d659a316cd856fd550e83b125573409f239b8cf2 --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 @@ -0,0 +1,71 @@ +ARM_ABI = arm7 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + 
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..c80b07d5c029a3624a514e07375fd08e8770da25 --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 @@ -0,0 +1,70 @@ +ARM_ABI = arm8 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) 
+############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..b584f5623594fd64f10a86766828c62cdfe08aef --- /dev/null +++ 
b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..27779817012bce527d4506a0dcd377bf4ced3c1a --- /dev/null +++ b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + 
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc old mode 100755 new mode 100644 index c651bf9f4cca0db0e126311e5a03b3ade6ccf886..d0cf59e185e1330b7d8487d562afa0af29236007 --- a/lite/demo/cxx/mobile_classify/mobile_classify.cc +++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc @@ -117,7 +117,7 @@ void pre_process(const cv::Mat& img, float* means, float* scales) { cv::Mat rgb_img; - // cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); cv::Mat imgf; rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); diff --git a/lite/demo/cxx/mobile_detection/test.jpg b/lite/demo/cxx/mobile_detection/test.jpg deleted file mode 100644 index 6bb36e136deec6088c7b75215fc35d6231283673..0000000000000000000000000000000000000000 Binary files a/lite/demo/cxx/mobile_detection/test.jpg and /dev/null differ diff --git a/lite/demo/cxx/mobile_detection/mobile_detection.cc 
b/lite/demo/cxx/ssd_detection/ssd_detection.cc similarity index 98% rename from lite/demo/cxx/mobile_detection/mobile_detection.cc rename to lite/demo/cxx/ssd_detection/ssd_detection.cc index 9b8f02aeedef991496541400e7db67c3e3ff0e51..011733eb87f551141c52ab8e23d9625c93c742fc 100644 --- a/lite/demo/cxx/mobile_detection/mobile_detection.cc +++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc @@ -194,7 +194,7 @@ void RunModel(std::string model_dir, std::string img_path) { } auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); std::string result_name = - img_path.substr(0, img_path.find(".")) + "_detection_result.jpg"; + img_path.substr(0, img_path.find(".")) + "_ssd_detection_result.jpg"; cv::imwrite(result_name, img); } diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md new file mode 100644 index 0000000000000000000000000000000000000000..36d2985a4fd4f243027f8caab9b6c5a8beb94cad --- /dev/null +++ b/lite/demo/cxx/test_cv/README.md @@ -0,0 +1,131 @@ +# 图像预测库的使用 +1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式 +example: +```shell +set BUILD_WITH_CV=ON or LITE_WITH_CV=ON +./lite/tools/build.sh +--arm_os=android +--arm_abi=armv8 +--arm_lang=gcc +--android_stl=c++_static +full_publish +``` + +2. 准备模型和优化模型 +example: +```shell +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +./lite/tools/build.sh build_optimize_tool +./build.model_optimize_tool/lite/api/model_optimize_tool +--optimize_out_type=naive_buffer +--optimize_out=model_dir +--model_dir=model_dir +--prefer_int8_kernel=false +``` + +3. 
编译并运行完整test_model_cv demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- 修改MakeFile, 注释编译test_img_propress 语句 + ```shell + test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + #test_img_propress: fetch_opencv test_img_propress.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + #test_img_propress.o: test_img_propress.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + rm -f test_model_cv.o + rm -f test_model_cv + #rm -f test_img_propress.o + #rm -f test_img_propress + ``` +- 修改../../..//cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径 + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- 测试模型必须是优化后的模型 + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_model_cv /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_model_cv +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " +``` +运行成功将在控制台输出部分预测结果 + +4. 
编译并运行完整test_img_preprocess demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- 修改MakeFile, 注释编译test_model_cv 语句 + ```shell + #test_model_cv: fetch_opencv test_model_cv.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + #test_model_cv.o: test_model_cv.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + test_img_propress: fetch_opencv test_img_propress.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + test_img_propress.o: test_img_propress.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + #rm -f test_model_cv.o + #rm -f test_model_cv + rm -f test_img_propress.o + rm -f test_img_propress + ``` +- 修改../../..//cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径 + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- 测试模型必须是优化后的模型 + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_img_propress /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_model_cv +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 " +adb -s device_id pull /data/local/tmp/resize.jpg ./ +adb -s device_id pull /data/local/tmp/convert.jpg ./ +adb -s device_id pull /data/local/tmp/flip.jpg ./ +adb -s device_id pull /data/local/tmp/rotate.jpg ./ +``` +运行成功将在控制台输出OpenCV 和 Padlle-lite的耗时;同时,将在test_cv目录下看到生成的图像预处理结果图: 
如:resize.jpg、convert.jpg等 diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc new file mode 100644 index 0000000000000000000000000000000000000000..c2cbd66cc0a15a1032141641d83fbf8db85d20bf --- /dev/null +++ b/lite/demo/cxx/test_cv/test_img_prepross.cc @@ -0,0 +1,389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT +typedef paddle::lite_api::Tensor Tensor; +typedef paddle::lite::utils::cv::ImageFormat ImageFormat; +typedef paddle::lite::utils::cv::FlipParam FlipParam; +typedef paddle::lite::utils::cv::TransParam TransParam; +typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; +typedef paddle::lite_api::DataLayoutType LayoutType; +using namespace paddle::lite_api; // NOLINT + +void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT + for (int i = 0; i < mat.rows; i++) { + for (int j = 0; j < mat.cols; j++) { + int tmp = (i * mat.cols + j) * 3; + cv::Vec3b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } + } +} +void test_img(std::vector cluster_id, + std::vector thread_num, + std::string img_path, + std::string dst_path, + ImageFormat srcFormat, + 
ImageFormat dstFormat, + int width, + int height, + float rotate, + FlipParam flip, + LayoutType layout, + std::string model_dir, + int test_iter = 1) { + // init + // paddle::lite::DeviceInfo::Init(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + int srch = img.rows; + int srcw = img.cols; + for (auto& cls : cluster_id) { + for (auto& th : thread_num) { + std::cout << "cluster: " << cls << ", threads: " << th << std::endl; + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode((PowerMode)cls); + config.set_threads(th); + std::cout << "model: " << model_dir; + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data from image + std::unique_ptr input_tensor(predictor->GetInput(0)); + + /* + imread(img_path, param) + IMREAD_UNCHANGED(<0) 表示加载原图,不做任何改变 + IMREAD_GRAYSCALE ( 0)表示把原图作为灰度图像加载进来 + IMREAD_COLOR (>0) 表示把原图作为RGB图像加载进来 + */ + cv::Mat img; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = imread(img_path, cv::IMREAD_COLOR); + } else if (srcFormat == ImageFormat::GRAY) { + img = imread(img_path, cv::IMREAD_GRAYSCALE); + } else { + printf("this format %d does not support \n", srcFormat); + return; + } + if (img.empty()) { + std::cout << "opencv read image " << img_path.c_str() << " failed" + << std::endl; + return; + } + int srch = img.rows; + int srcw = img.cols; + int dsth = height; + int dstw = width; + + std::cout << " input tensor size, num= " << 1 << ", channel= " << 1 + << ", height= " << srch << ", width= " << srcw + << ", srcFormat= " << (ImageFormat)srcFormat << std::endl; + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + } + if (srcFormat == ImageFormat::BGR) { + std::cout 
<< "srcFormat: BGR" << std::endl; + } + if (srcFormat == ImageFormat::RGB) { + std::cout << "srcFormat: RGB" << std::endl; + } + std::cout << " output tensor size, num=" << 1 << ", channel=" << 1 + << ", height=" << dsth << ", width=" << dstw + << ", dstFormat= " << (ImageFormat)dstFormat << std::endl; + + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } + if (dstFormat == ImageFormat::BGR) { + std::cout << "dstFormat: BGR" << std::endl; + } + if (dstFormat == ImageFormat::RGB) { + std::cout << "dstFormat: RGB" << std::endl; + } + + std::cout << "Rotate = " << rotate << ", Flip = " << flip + << ", Layout = " << static_cast(layout) << std::endl; + if (static_cast(layout) != 1 && static_cast(layout) != 3) { + std::cout << "this layout" << static_cast(layout) + << " is no support" << std::endl; + } + int size = 3 * srch * srcw; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + size = 3 * srch * srcw; + } else if (srcFormat == ImageFormat::GRAY) { + size = srch * srcw; + } + uint8_t* src = img.data; + + int out_size = srch * srcw; + int resize = dstw * dsth; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + out_size = 3 * srch * srcw; + resize = 3 * dsth * dstw; + } else if (dstFormat == ImageFormat::GRAY) { + out_size = srch * srcw; + resize = dsth * dstw; + } + // out + uint8_t* lite_dst = new uint8_t[out_size]; + uint8_t* resize_tmp = new uint8_t[resize]; + uint8_t* tv_out_ratote = new uint8_t[out_size]; + uint8_t* tv_out_flip = new uint8_t[out_size]; + std::vector shape_out = {1, 3, srch, srcw}; + + input_tensor->Resize(shape_out); + Tensor dst_tensor = *input_tensor; + std::cout << "opencv compute" << std::endl; + cv::Mat im_convert; + cv::Mat im_resize; + cv::Mat im_rotate; + cv::Mat im_flip; + double to_1 = 0; + double to_2 = 0; + double to_3 = 0; + double to_4 = 0; + double to1 = 0; + for (int i = 0; i < test_iter; i++) { + clock_t start = clock(); + clock_t begin = clock(); 
+ // convert bgr-gray + if (dstFormat == srcFormat) { + im_convert = img; + } else if (dstFormat == ImageFormat::BGR && + srcFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR); + } else if (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY); + } else if (dstFormat == srcFormat) { + printf("convert format error \n"); + return; + } + clock_t end = clock(); + to_1 += (end - begin); + + begin = clock(); + // resize default linear + cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); + end = clock(); + to_2 += (end - begin); + + begin = clock(); + // rotate 90 + if (rotate == 90) { + cv::flip(im_convert.t(), im_rotate, 1); + } else if (rotate == 180) { + cv::flip(im_convert, im_rotate, -1); + } else if (rotate == 270) { + cv::flip(im_convert.t(), im_rotate, 0); + } + end = clock(); + to_3 += (end - begin); + + begin = clock(); + // flip + cv::flip(im_convert, im_flip, flip); + end = clock(); + to_4 += (end - begin); + clock_t ovet = clock(); + to1 += (ovet - start); + } + + std::cout << "Paddle-lite compute" << std::endl; + double lite_to = 0; + double lite_to_1 = 0; + double lite_to_2 = 0; + double lite_to_3 = 0; + double lite_to_4 = 0; + double lite_to_5 = 0; + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = dsth; + tparam.ow = dstw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; + + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + + for (int i = 0; i < test_iter; ++i) { + clock_t start = clock(); + clock_t begin = clock(); + image_preprocess.imageConvert(src, lite_dst); + clock_t end = clock(); + lite_to_1 += (end - begin); + + begin = clock(); + image_preprocess.imageResize(lite_dst, resize_tmp); + end = clock(); + lite_to_2 += (end - begin); + + begin = clock(); + image_preprocess.imageRotate( + lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90); + end = clock(); + lite_to_3 += (end 
- begin); + + begin = clock(); + image_preprocess.imageFlip( + lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip); + end = clock(); + lite_to_4 += (end - begin); + + clock_t over = clock(); + lite_to += (over - start); + + begin = clock(); + image_preprocess.image2Tensor(lite_dst, + &dst_tensor, + (ImageFormat)dstFormat, + srcw, + srch, + layout, + means, + scales); + end = clock(); + lite_to_5 += (end - begin); + } + to_1 = 1000 * to_1 / CLOCKS_PER_SEC; + to_2 = 1000 * to_2 / CLOCKS_PER_SEC; + to_3 = 1000 * to_3 / CLOCKS_PER_SEC; + to_4 = 1000 * to_4 / CLOCKS_PER_SEC; + to1 = 1000 * to1 / CLOCKS_PER_SEC; + std::cout << "opencv convert run time: " << to_1 + << "ms, avg: " << to_1 / test_iter << std::endl; + std::cout << "opencv resize run time: " << to_2 + << "ms, avg: " << to_2 / test_iter << std::endl; + std::cout << "opencv rotate run time: " << to_3 + << "ms, avg: " << to_3 / test_iter << std::endl; + std::cout << "opencv flip time: " << to_4 + << "ms, avg: " << to_4 / test_iter << std::endl; + std::cout << "opencv total run time: " << to1 + << "ms, avg: " << to1 / test_iter << std::endl; + std::cout << "------" << std::endl; + + lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC; + lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC; + lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC; + lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC; + lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC; + lite_to = 1000 * lite_to / CLOCKS_PER_SEC; + std::cout << "lite convert run time: " << lite_to_1 + << "ms, avg: " << lite_to_1 / test_iter << std::endl; + std::cout << "lite resize run time: " << lite_to_2 + << "ms, avg: " << lite_to_2 / test_iter << std::endl; + std::cout << "lite rotate run time: " << lite_to_3 + << "ms, avg: " << lite_to_3 / test_iter << std::endl; + std::cout << "lite flip time: " << lite_to_4 + << "ms, avg: " << lite_to_4 / test_iter << std::endl; + std::cout << "lite total run time: " << lite_to + << "ms, avg: " << lite_to / test_iter << std::endl; + 
std::cout << "lite img2tensor time: " << lite_to_5 + << "ms, avg: " << lite_to_5 / test_iter << std::endl; + std::cout << "------" << std::endl; + + double max_ratio = 0; + double max_diff = 0; + const double eps = 1e-6f; + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/resize.jpg"; + std::string convert_name = dst_path + "/convert.jpg"; + std::string rotate_name = dst_path + "/rotate.jpg"; + std::string flip_name = dst_path + "/flip.jpg"; + cv::Mat resize_mat(dsth, dstw, CV_8UC3); + cv::Mat convert_mat(srch, srcw, CV_8UC3); + cv::Mat rotate_mat; + if (rotate == 90 || rotate == 270) { + rotate_mat = cv::Mat(srcw, srch, CV_8UC3); + } else { + rotate_mat = cv::Mat(srch, srcw, CV_8UC3); + } + cv::Mat flip_mat(srch, srcw, CV_8UC3); + fill_with_mat(resize_mat, resize_tmp); + fill_with_mat(convert_mat, lite_dst); + fill_with_mat(rotate_mat, tv_out_ratote); + fill_with_mat(flip_mat, tv_out_flip); + cv::imwrite(convert_name, convert_mat); + cv::imwrite(resize_name, resize_mat); + cv::imwrite(rotate_name, rotate_mat); + cv::imwrite(flip_name, flip_mat); + delete[] lite_dst; + delete[] resize_tmp; + delete[] tv_out_ratote; + delete[] tv_out_flip; + } + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " image_path dst_apth srcFormat dstFormat width height\n"; + exit(1); + } + std::string image_path = argv[1]; + std::string dst_path = argv[2]; + int srcFormat = atoi(argv[3]); + int dstFormat = atoi(argv[4]); + int width = atoi(argv[5]); + int height = atoi(argv[6]); + int flip = -1; + float rotate = 90; + int layout = 1; + std::string model_dir = "mobilenet_v1"; + if (argc > 7) { + model_dir = argv[7]; + } + if (argc > 8) { + flip = atoi(argv[8]); + } + if (argc > 9) { + rotate = atoi(argv[9]); + } + if (argc > 10) { + layout = atoi(argv[10]); + } + test_img({3}, + {1, 2, 4}, + image_path, + dst_path, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + width, + 
height, + rotate, + (FlipParam)flip, + (LayoutType)layout, + model_dir, + 20); + return 0; +} diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f408bf4a55ea2d499e39902201597c0e8c6e4e --- /dev/null +++ b/lite/demo/cxx/test_cv/test_model_cv.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} +void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { +#ifdef LITE_WITH_CV + typedef paddle::lite::utils::cv::ImageFormat ImageFormat; + typedef paddle::lite::utils::cv::FlipParam FlipParam; + typedef paddle::lite::utils::cv::TransParam TransParam; + typedef 
paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; + typedef paddle::lite_api::DataLayoutType LayoutType; + // init TransParam + TransParam tp; + tp.iw = img.cols; + tp.ih = img.rows; + tp.ow = width; + tp.oh = height; + ImageFormat srcFormat = ImageFormat::BGR; + ImageFormat dstFormat = ImageFormat::RGB; + // init ImagePreprocess + ImagePreprocess img_process(srcFormat, dstFormat, tp); + // init temp var + const uint8_t* img_ptr = reinterpret_cast(img.data); + uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3]; + uint8_t* resize_ptr = new uint8_t[width * height * 3]; + // do convert bgr--rgb + img_process.imageConvert(img_ptr, rgb_ptr); + // do resize + img_process.imageResize(rgb_ptr, resize_ptr); + // data--tensor and normalize + float means[3] = {103.94f, 116.78f, 123.68f}; + float scales[3] = {0.017f, 0.017f, 0.017f}; + img_process.image2Tensor( + resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales); + float* data = dstTensor.mutable_data(); +#else + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + float* data = dstTensor.mutable_data(); + neon_mean_scale(dimg, data, width * height, means, scales); +#endif +} + +void RunModel(std::string model_dir, + std::string img_path, + std::vector input_shape, + PowerMode power_mode, + int thread_num, + int test_iter, + int warmup = 0) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + // 3. 
Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); + auto* data = input_tensor->mutable_data(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + + pre_process(img, input_shape[3], input_shape[2], *input_tensor); + + // 4. Run predictor + for (int i = 0; i < warmup; ++i) { + predictor->Run(); + } + double lps = 0.f; + double min_time = 1000000.f; + double max_time = 0.f; + for (int i = 0; i < test_iter; ++i) { + clock_t begin = clock(); + predictor->Run(); + clock_t end = clock(); + double t = (end - begin) * 1000; + t = t / CLOCKS_PER_SEC; + lps += t; + if (t < min_time) { + min_time = t; + } + if (t > max_time) { + max_time = t; + } + std::cout << "iter: " << i << ", time: " << t << " ms" << std::endl; + } + std::cout << "================== Speed Report ===================" + << std::endl; + std::cout << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup + << ", repeats: " << test_iter << ", avg time: " << lps / test_iter + << " ms" + << ", min time: " << min_time << " ms" + << ", max time: " << max_time << " ms." << std::endl; + + // 5. 
Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int output_num = 1; + for (int i = 0; i < shape_out.size(); ++i) { + output_num *= shape_out[i]; + } + std::cout << "output_num: " << output_num << std::endl; + for (int i = 0; i < output_num; i += 100) { + std::cout << "i: " << i << ", out: " << outptr[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " model_dir image_path input_shape\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + std::vector input_shape; + input_shape.push_back(atoi(argv[3])); + input_shape.push_back(atoi(argv[4])); + input_shape.push_back(atoi(argv[5])); + input_shape.push_back(atoi(argv[6])); + int power_mode = 3; + int threads = 1; + int test_iter = 100; + int warmup = 10; + if (argc > 7) { + power_mode = atoi(argv[7]); + } + if (argc > 8) { + threads = atoi(argv[8]); + } + if (argc > 9) { + test_iter = atoi(argv[9]); + } + if (argc > 10) { + warmup = atoi(argv[10]); + } + RunModel(model_dir, + img_path, + input_shape, + (PowerMode)power_mode, + threads, + test_iter, + warmup); + return 0; +} diff --git a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9beb1ed28de1f3c28eb5c03b3b660d518ee10c5 --- /dev/null +++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = {"person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || 
scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize( + rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.485f, 0.456f, 0.406f}; + std::vector scale = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = 
image.cols; + int orih = image.rows; + if (data[1] > thresh) { + Object obj; + int x = static_cast(data[2]); + int y = static_cast(data[3]); + int w = static_cast(data[4] - data[2] + 1); + int h = static_cast(data[5] - data[3] + 1); + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 1, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 1; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.5 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 3; + origin.y = y + text_size.height + 3; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + const int in_width = 608; + const int in_height = 608; + + // 3. 
Prepare input data from image + // input 0 + std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); + input_tensor0->Resize({1, 3, in_height, in_width}); + auto* data0 = input_tensor0->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data0); + // input1 + std::unique_ptr input_tensor1(std::move(predictor->GetInput(1))); + input_tensor1->Resize({1, 2}); + auto* data1 = input_tensor1->mutable_data(); + data1[0] = img.rows; + data1[1] = img.cols; + + // 4. Run predictor + predictor->Run(); + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.5f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_yolov3_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index ce8b8365a8c55796772e7fbbe672ead682343a60..74b86c519e44f3aec5f0fbc7f3e2b3aa8d39c554 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. 
-if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.cc b/lite/kernels/arm/collect_fpn_proposals_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.h b/lite/kernels/arm/collect_fpn_proposals_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conditional_block_compute.cc b/lite/kernels/arm/conditional_block_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conditional_block_compute.h b/lite/kernels/arm/conditional_block_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 8c76f243a647553de198bea5c581a620d99bbbeb..52849a026e1b23a0fa9757030764a0012092ef4c 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -110,8 +110,7 @@ void ConvCompute::PrepareForRun() { bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); - bool flag_dw_5x5 = pads_all_equal && - ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); + bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; if (param.groups == ic && ic == oc && kps_equal && pads_equal && @@ -156,8 +155,7 @@ void ConvCompute::PrepareForRun() { bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); - bool flag_dw_5x5 = pads_all_equal && - ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); + bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; if (param.groups == ic && ic == oc && 
kps_equal && pads_equal && diff --git a/lite/kernels/arm/conv_transpose_compute_test.cc b/lite/kernels/arm/conv_transpose_compute_test.cc deleted file mode 100644 index 298c651d9f86a5fc3d527cd7b973b1a24160ec42..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/conv_transpose_compute_test.cc +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/conv_transpose_compute.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -static void basic_gemm(int m, - int n, - int k, - const type* a, - const type* b, - const type2* bias, - type2* c, - type2 alpha, - type2 beta, - bool trans_a = false, - bool trans_b = false, - bool flag_bias = false, - bool flag_relu = false) { -#pragma omp parallel for - for (int i = 0; i < m; ++i) { - type2 bias_data = (type2)0; - if (flag_bias) { - bias_data = bias[i]; - } - for (int j = 0; j < n; ++j) { - type2 sum = static_cast(0); - for (int l = 0; l < k; ++l) { - type av; - type bv; - if (trans_a) { - av = a[l * m + i]; - } else { - av = a[i * k + l]; - } - if (trans_b) { - bv = b[j * k + l]; - } else { - bv = b[l * n + j]; - } - sum += av * bv; - } - type2 tmp = alpha * sum + beta * c[i * n + j] + 
bias_data; - if (flag_relu) { - c[i * n + j] = tmp > (type2)0 ? tmp : (type2)0; - } else { - c[i * n + j] = tmp; - } - } - } -} - -//! for float, dtype1 and type2 is float -//! for int8, dytpe1 is char, dtype2 is int -template -bool deconv_basic(const Dtype1* din, - Dtype2* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const Dtype1* weights, - const Dtype2* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu) { - int m = chout * kernel_w * kernel_h / group; - int n = hin * win; - int k = chin / group; - - if (chin != chout || group != chin) { - CHECK_OR_FALSE(chin % group == 0); - CHECK_OR_FALSE(chout % group == 0); - } - - lite::Tensor workspace_tensor; - std::vector wt_shape = {1, 1, 1, group * m * n}; - workspace_tensor.Resize(wt_shape); - auto* workspace_ptr = workspace_tensor.mutable_data(); - - int group_size_in = win * hin * chin / group; - int group_size_out = wout * hout * chout / group; - int group_size_coldata = m * n; - int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); - bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); - - for (int i = 0; i < num; ++i) { - const Dtype1* din_batch = din + i * chin * hin * win; - Dtype2* dout_batch = dout + i * chout * hout * wout; - - Dtype2* col_data = workspace_ptr; - if (flag_1x1s1p1) { - col_data = dout_batch; - } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); - for (int g = 0; g < group; ++g) { - const Dtype1* din_group = din_batch + g * group_size_in; - const Dtype1* weights_group = weights + g * group_size_weights; - Dtype2* coldata_group = col_data + g * group_size_coldata; - basic_gemm(m, - n, - k, - weights_group, - din_group, - nullptr, - coldata_group, - (Dtype2)1, - (Dtype2)0, - true, - false, 
- false, - (!flag_bias && flag_relu)); - } - if (!flag_1x1s1p1) { - lite::arm::math::col2im(col_data, - chout, - hout, - wout, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - dout_batch); - } - if (flag_bias) { - lite::arm::math::fill_bias_relu( - dout_batch, bias, chout, wout * hout, flag_bias, flag_relu); - } - } - return true; -} - -template -void conv2d_transpose_compute_ref(const operators::ConvParam& param) { - const Dtype1* din = param.x->data(); - Dtype2* dout = param.output->mutable_data(); - - int num = param.x->dims()[0]; - int chout = param.output->dims()[1]; - int hout = param.output->dims()[2]; - int wout = param.output->dims()[3]; - - int chin = param.x->dims()[1]; - int hin = param.x->dims()[2]; - int win = param.x->dims()[3]; - - const Dtype1* weights = param.filter->mutable_data(); - Dtype2* bias = nullptr; - if (param.bias != nullptr) { - bias = param.bias->mutable_data(); - } - - int group = param.groups; - int kernel_h = param.filter->dims()[2]; - int kernel_w = param.filter->dims()[3]; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; - bool flag_bias = (param.bias != nullptr); - bool flag_relu = param.fuse_relu; - - deconv_basic(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - group, - kernel_w, - kernel_h, - stride_w, - stride_h, - dila_w, - dila_h, - pad_w, - pad_h, - flag_bias, - flag_relu); -} - -TEST(conv2d_transpose_arm, retrive_op) { - auto op = KernelRegistry::Global().Create( - "conv2d_transpose"); - ASSERT_FALSE(op.empty()); - ASSERT_TRUE(op.front()); -} - -TEST(conv2d_transpose_arm, init) { - Conv2DTransposeCompute compute; - ASSERT_EQ(compute.precision(), PRECISION(kFloat)); - ASSERT_EQ(compute.target(), TARGET(kARM)); -} - -TEST(conv2d_transpose_arm, compute) { - DeviceInfo::Init(); - for (auto n 
: {1, 2}) { - for (auto ic : {1, 3 /*, 128*/}) { - for (auto oc : {1, 3 /*, 128*/}) { - for (auto ih : {2, 8 /*, 56 , 112, 224, 512*/}) { - for (auto iw : {2, 8 /*, 56, 112, 224, 512*/}) { - for (auto flag_bias : {false, true}) { - for (auto flag_relu : {false, true}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1, 2}) { - for (auto ks : {2, 3, 5}) { - for (auto group : {1, 2}) { - // obtain shape - if (ic % group != 0 || oc % group != 0) { - group = 1; - } - std::vector input_shape = {n, ic, ih, iw}; - std::vector filter_shape = { - oc / group, ic, ks, ks}; - int oh = (ih - 1) * stride - 2 * padding + - dilation * (ks - 1) + 1; - int ow = (iw - 1) * stride - 2 * padding + - dilation * (ks - 1) + 1; - if (oh < 1 || ow < 1) { - break; - } - std::vector output_shape = {n, oc, oh, ow}; - std::vector bias_shape = {1, oc, 1, 1}; - - // define and resize tensor - Tensor input; - Tensor filter; - Tensor filter_copy; - Tensor bias; - Tensor output; - Tensor output_ref; - input.Resize(input_shape); - filter.Resize(filter_shape); - filter_copy.Resize(filter_shape); - output.Resize(output_shape); - output_ref.Resize(output_shape); - auto* input_data = input.mutable_data(); - auto* filter_data = filter.mutable_data(); - auto* filter_copy_data = - filter_copy.mutable_data(); - auto* output_data = output.mutable_data(); - - // initialize tensor - for (int i = 0; i < input.dims().production(); i++) { - float sign = i % 3 == 0 ? 
-1.0f : 1.0f; - input_data[i] = sign * static_cast(i % 128); - } - for (int i = 0; i < filter.dims().production(); i++) { - filter_data[i] = - i / - static_cast(filter.dims().production()); - filter_copy_data[i] = - i / static_cast( - filter_copy.dims().production()); - } - if (flag_bias) { - bias.Resize(bias_shape); - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i); - } - } - - // prepare kernel params and run - std::unique_ptr ctx(new KernelContext); - ctx->As(); - Conv2DTransposeCompute conv2d_transpose; - conv2d_transpose.SetContext(std::move(ctx)); - operators::ConvParam param; - param.x = &input; - param.filter = &filter; - param.output = &output; - param.bias = nullptr; - if (flag_bias) { - bias.Resize(bias_shape); - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i); - } - param.bias = &bias; - } - param.fuse_relu = flag_relu; - param.paddings = std::vector({padding, padding}); - param.strides = std::vector({stride, stride}); - param.dilations = - std::vector({dilation, dilation}); - param.groups = group; - conv2d_transpose.SetParam(param); - conv2d_transpose.Launch(); - - // invoking ref implementation and compare results - param.filter = &filter_copy; - param.output = &output_ref; - conv2d_transpose_compute_ref(param); - auto* output_ref_data = - output_ref.mutable_data(); - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR( - output_data[i], output_ref_data[i], 1e-3); - } - } - } - } - } - } - } - } - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle -USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.cc b/lite/kernels/arm/distribute_fpn_proposals_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.h 
b/lite/kernels/arm/distribute_fpn_proposals_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/grid_sampler_compute.cc b/lite/kernels/arm/grid_sampler_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/grid_sampler_compute.h b/lite/kernels/arm/grid_sampler_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/instance_norm_compute.cc b/lite/kernels/arm/instance_norm_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/instance_norm_compute.h b/lite/kernels/arm/instance_norm_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute.cc b/lite/kernels/arm/merge_lod_tensor_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute.h b/lite/kernels/arm/merge_lod_tensor_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute_test.cc b/lite/kernels/arm/merge_lod_tensor_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/reduce_prod_compute.cc b/lite/kernels/arm/reduce_prod_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/reduce_prod_compute.h b/lite/kernels/arm/reduce_prod_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/split_lod_tensor_compute.cc b/lite/kernels/arm/split_lod_tensor_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/split_lod_tensor_compute.h b/lite/kernels/arm/split_lod_tensor_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/split_lod_tensor_compute_test.cc b/lite/kernels/arm/split_lod_tensor_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/yolo_box_compute.cc b/lite/kernels/arm/yolo_box_compute.cc index ad8a630b8c0064af7358674d1b7424eff25a194a..38443bf27726ee879b38e3058c8d3a48df975baf 100644 --- a/lite/kernels/arm/yolo_box_compute.cc +++ b/lite/kernels/arm/yolo_box_compute.cc @@ -32,6 +32,8 @@ void 
YoloBoxCompute::Run() { int class_num = param.class_num; float conf_thresh = param.conf_thresh; int downsample_ratio = param.downsample_ratio; + Boxes->clear(); + Scores->clear(); lite::arm::math::yolobox(X, ImgSize, Boxes, diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index bf59d0272611d314dcee41c620bb3f9b3ca08c7e..2df00f00a4eefd8fc6f9bee5e0c9b76656232041 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_CUDA) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA)) return() endif() diff --git a/lite/kernels/cuda/conv_compute_test.cc b/lite/kernels/cuda/conv_compute_test.cc index 2ebd7e33baf8e12cfce24661f186382152b6bb89..46b63f2e310d2e24a3935eb2f66c8c9d4a339712 100644 --- a/lite/kernels/cuda/conv_compute_test.cc +++ b/lite/kernels/cuda/conv_compute_test.cc @@ -15,6 +15,7 @@ #include "lite/kernels/cuda/conv_compute.h" #include #include +#include #include #include diff --git a/lite/kernels/cuda/elementwise_add_compute.cu b/lite/kernels/cuda/elementwise_add_compute.cu deleted file mode 100644 index 4bacf532a2b67168679449200b1af721b7a282c8..0000000000000000000000000000000000000000 --- a/lite/kernels/cuda/elementwise_add_compute.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "lite/backends/cuda/math/elementwise.h" -#include "lite/core/op_registry.h" -#include "lite/kernels/cuda/elementwise_add_compute.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -void ElementwiseAddCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - lite::cuda::math::elementwise_add( - pixel_num, x_data, y_data, out_data, stream); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -void ElementwiseAddComputeNHWC::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - lite::cuda::math::elementwise_add( - pixel_num, x_data, y_data, out_data, stream); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -void ElementwiseAddComputeInt8::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - const int c = x->dims()[3]; - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = 
out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - float output_scale = param.output_scale; - if (c % 4 == 0) { - lite::cuda::math::elementwise_add_nhwc4_int8( - pixel_num / 4, - static_cast(x_data), - static_cast(y_data), - 1. / output_scale, - static_cast(out_data), - stream); - } else { - lite::cuda::math::elementwise_add_int8( - pixel_num, x_data, y_data, 1. / output_scale, out_data, stream); - } - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(elementwise_add, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::ElementwiseAddCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .Finalize(); - -REGISTER_LITE_KERNEL(elementwise_add, - kCUDA, - kFloat, - kNHWC, - paddle::lite::kernels::cuda::ElementwiseAddComputeNHWC, - nhwc_format) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .Finalize(); diff --git a/lite/kernels/cuda/elementwise_add_compute.h b/lite/kernels/cuda/elementwise_add_compute.h deleted file mode 100644 index 5c3fecc5d894aeea2bc5260b1815bbfa718eb5c6..0000000000000000000000000000000000000000 --- a/lite/kernels/cuda/elementwise_add_compute.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/core/kernel.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -class ElementwiseAddCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddCompute() = default; -}; - -class ElementwiseAddComputeNHWC - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddComputeNHWC() = default; -}; - -class ElementwiseAddComputeInt8 - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddComputeInt8() = default; -}; - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_add_compute_test.cc b/lite/kernels/cuda/elementwise_add_compute_test.cc deleted file mode 100644 index cc63f1470b65de37eb73c71701a83146e12778ae..0000000000000000000000000000000000000000 --- a/lite/kernels/cuda/elementwise_add_compute_test.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/cuda/elementwise_add_compute.h" -#include -#include -#include -#include "lite/api/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -using Tensor = lite::Tensor; - -static void ElementwiseAddRef(float* x, float* y, float* out, int num) { - for (int i = 0; i < num; ++i) { - out[i] = x[i] + y[i]; - } -} - -TEST(elementwise_add, normal) { - ElementwiseAddCompute elementwise_add_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); - - operators::ElementwiseParam param; - Tensor x, y, out; - Tensor x_cpu, y_cpu, out_cpu; - Tensor x_ref, y_ref, out_ref; - - const int n = 1; - const int c = 3; - const int h = 2000; - const int w = 2000; - - x.Resize({n, c, h, w}); - y.Resize({n, c, h, w}); - out.Resize({n, c, h, w}); - x_cpu.Resize({n, c, h, w}); - y_cpu.Resize({n, c, h, w}); - out_cpu.Resize({n, c, h, w}); - x_ref.Resize({n, c, h, w}); - y_ref.Resize({n, c, h, w}); - out_ref.Resize({n, c, h, w}); - - auto* out_data = out.mutable_data(TARGET(kCUDA)); - - auto* x_cpu_data = x_cpu.mutable_data(); - auto* y_cpu_data = y_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - - auto* x_ref_data = x_ref.mutable_data(); - auto* y_ref_data = y_ref.mutable_data(); - auto* out_ref_data = out_ref.mutable_data(); - - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 5.0; - x_ref_data[i] = i + 5.0; - } - for (int i = 0; i < y_cpu.numel(); ++i) { - y_cpu_data[i] = i - 5.0; - y_ref_data[i] = i - 5.0; - } - - x.Assign(x_cpu_data, x_cpu.dims()); - 
y.Assign(y_cpu_data, y_cpu.dims()); - - param.X = &x; - param.Y = &y; - param.Out = &out; - elementwise_add_kernel.SetParam(param); - - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - - elementwise_add_kernel.SetContext(std::move(ctx)); - elementwise_add_kernel.Launch(); - cudaDeviceSynchronize(); - - CopySync( - out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - ElementwiseAddRef(x_ref_data, y_ref_data, out_ref_data, out.numel()); - for (int i = 0; i < out.numel(); i++) { - EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(elementwise_add, int8_out) { - ElementwiseAddComputeInt8 elementwise_add_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); - - operators::ElementwiseParam param; - Tensor x, y, out; - Tensor x_cpu, y_cpu, out_cpu; - - const int n = 1; - const int h = 36; - const int w = 36; - const int c = 125; - - x.Resize({n, h, w, c}); - y.Resize({n, h, w, c}); - out.Resize({n, h, w, c}); - x_cpu.Resize({n, h, w, c}); - y_cpu.Resize({n, h, w, c}); - out_cpu.Resize({n, h, w, c}); - - auto* out_data = out.mutable_data(TARGET(kCUDA)); - - auto* x_cpu_data = x_cpu.mutable_data(); - auto* y_cpu_data = y_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 5.0; - } - for (int i = 0; i < y_cpu.numel(); ++i) { - y_cpu_data[i] = i; - } - - x.Assign(x_cpu_data, x_cpu.dims()); - y.Assign(y_cpu_data, y_cpu.dims()); - - param.X = &x; - param.Y = &y; - param.Out = &out; - param.output_scale = 50 / 127.; - elementwise_add_kernel.SetParam(param); - - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - - elementwise_add_kernel.SetContext(std::move(ctx)); - auto start = GetCurrentUS(); - for (int i = 0; i < 1000000; i++) { - elementwise_add_kernel.Launch(); - } - LOG(INFO) << "time: " << (GetCurrentUS() - start) / 1000000.; - - CopySync( - out_cpu_data, 
out_data, sizeof(int8_t) * out.numel(), IoDirection::DtoH); - for (int i = 0; i < out.numel(); i++) { - // LOG(INFO) << float(out_cpu_data[i]); - } -} - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/cuda/mul_compute.h b/lite/kernels/cuda/mul_compute.h index c2fc4364ef77742858b143734d2ecf4d13e201e9..320b562128583f7393ca3e1edb3e8bc1c30136ec 100644 --- a/lite/kernels/cuda/mul_compute.h +++ b/lite/kernels/cuda/mul_compute.h @@ -93,7 +93,6 @@ class MulCompute : public KernelLite { .Slice(param.y_num_col_dims, param.y->dims().size()) .production()); CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; - LOG(INFO) << x_h << " " << x_w << " " << y_h << " " << y_w; mul_compute(blas, x_data, x_h, x_w, y_data, y_h, y_w, out_data); } diff --git a/lite/kernels/cuda/sequence_pool_concat_compute.cu b/lite/kernels/cuda/sequence_pool_concat_compute.cu old mode 100755 new mode 100644 diff --git a/lite/kernels/cuda/sequence_pool_concat_compute.h b/lite/kernels/cuda/sequence_pool_concat_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu index 0a00c06cbfb9200e45d48a59aa26f2350c2cf9ab..6b4b2875f39c479f3ddd387230dbdf8e3d24ce3c 100644 --- a/lite/kernels/cuda/yolo_box_compute.cu +++ b/lite/kernels/cuda/yolo_box_compute.cu @@ -233,7 +233,7 @@ REGISTER_LITE_KERNEL(yolo_box, DATALAYOUT(kNCHW))}) .BindInput("ImgSize", {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), + PRECISION(kInt32), DATALAYOUT(kNCHW))}) .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kCUDA), diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt old mode 100644 new mode 100755 index 7c47e72872ecae6216288c20fa1a6ae30fac65bd..f6c3a399490a86e2ac2fcd9cbeb76fca8c8ac479 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_FPGA) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT 
LITE_WITH_FPGA)) return() endif() diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/conv_compute.h b/lite/kernels/fpga/conv_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/dropout_compute.cc b/lite/kernels/fpga/dropout_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/fc_compute.h b/lite/kernels/fpga/fc_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.h b/lite/kernels/fpga/feed_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/fetch_compute.h b/lite/kernels/fpga/fetch_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/im2sequence_compute.cc b/lite/kernels/fpga/im2sequence_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/im2sequence_compute.h b/lite/kernels/fpga/im2sequence_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/mul_compute.h b/lite/kernels/fpga/mul_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/norm_compute.cc b/lite/kernels/fpga/norm_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/norm_compute.h b/lite/kernels/fpga/norm_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/pooling_compute_test.cc 
b/lite/kernels/fpga/pooling_compute_test.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/prior_box_compute.h b/lite/kernels/fpga/prior_box_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/scale_compute.cc b/lite/kernels/fpga/scale_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/scale_compute.h b/lite/kernels/fpga/scale_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index c84e996f4c5a066e17f4adebce52d1dbd3e6762d..2c516e47e494a445156898c6c2b017607c2de6ee 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,11 +1,10 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU) return() endif() lite_cc_library(subgraph_bridge_registry SRCS registry.cc DEPS op) - lite_cc_library(subgraph_bridge_engine SRCS engine.cc DEPS tensor op scope program) diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 62eb649e0e5ec07c402347af98730cabf76c2540..a4d1009f1be286e8bd8dfcdd469ff53b6681c820 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -43,33 +43,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = 
graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Act node - auto act_node = graph->AddNode(out_name); - act_node->set_input_x(*x_node); + auto act_node = graph->Add(out_name); + auto act_op = act_node->data(); + act_op->set_input_x(*x_node->data()); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. - act_node->set_attr_mode(CvtActMode(op_type)); + act_op->set_attr_mode(CvtActMode(op_type)); if (op_type == "relu_clipped") { auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); - act_node->set_attr_coef(Relu_clipped_coef); + act_op->set_attr_coef(Relu_clipped_coef); } else if (op_type == "relu6") { float Relu_clipped_coef = 6.f; - act_node->set_attr_coef(Relu_clipped_coef); + act_op->set_attr_coef(Relu_clipped_coef); } else if (op_type == "leaky_relu") { auto alpha = op_info->GetAttr("alpha"); - act_node->set_attr_negative_slope(alpha); + act_op->set_attr_negative_slope(alpha); } else if (op_type == "hard_sigmoid") { auto slope = op_info->GetAttr("slope"); auto offset = op_info->GetAttr("offset"); - act_node->set_attr_negative_slope(slope); - act_node->set_attr_coef(offset); + act_op->set_attr_negative_slope(slope); + act_op->set_attr_coef(offset); } return SUCCESS; } @@ -79,25 +80,27 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - sigmoid, +REGISTER_SUBGRAPH_BRIDGE(sigmoid, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, relu, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, tanh, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - relu_clipped, +REGISTER_SUBGRAPH_BRIDGE(relu, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(tanh, kNPU, 
paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu_clipped, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, relu6, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - leaky_relu, +REGISTER_SUBGRAPH_BRIDGE(relu6, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, abs, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - softsign, +REGISTER_SUBGRAPH_BRIDGE(leaky_relu, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - softplus, +REGISTER_SUBGRAPH_BRIDGE(abs, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(softsign, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - hard_sigmoid, +REGISTER_SUBGRAPH_BRIDGE(softplus, + kNPU, + paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(hard_sigmoid, + kNPU, paddle::lite::subgraph::npu::ActConverter); diff --git a/lite/kernels/npu/bridges/argmax_op.cc b/lite/kernels/npu/bridges/argmax_op.cc old mode 100755 new mode 100644 index 835d4dd1ed384b2ec8e0436317326b0d07d4e49d..3d397aab9d5cc7cfb800198184d656856d8c101f --- a/lite/kernels/npu/bridges/argmax_op.cc +++ b/lite/kernels/npu/bridges/argmax_op.cc @@ -44,20 +44,21 @@ int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { int axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Axis node - auto axis_const_node = graph->AddNode(out_name + "/axis", axis); + auto axis_node = graph->Add(out_name + "/axis", axis); // Argmax node - auto argmax_node = graph->AddNode(out_name); - argmax_node->set_input_x1(*x_node); - 
argmax_node->set_input_x2(*axis_const_node); + auto argmax_node = graph->Add(out_name); + auto argmax_op = argmax_node->data(); + argmax_op->set_input_x1(*x_node->data()); + argmax_op->set_input_x2(*axis_node->data()); return SUCCESS; } @@ -66,6 +67,6 @@ int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - arg_max, +REGISTER_SUBGRAPH_BRIDGE(arg_max, + kNPU, paddle::lite::subgraph::npu::ArgmaxConverter); diff --git a/lite/kernels/npu/bridges/argmax_op_test.cc b/lite/kernels/npu/bridges/argmax_op_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 57b52cf745ef189f4c6151940de73e9f944f72dc..d151fd8d7b35483f41190ecc789844a99e1f72de 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -67,30 +67,31 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { bool use_global_stats = op_info->GetAttr("use_global_stats"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale, Bias, Mean, Variance node - auto scale_const_node = graph->AddNode(scale_name, *scale); - auto bias_const_node = graph->AddNode(bias_name, *bias); - auto mean_const_node = graph->AddNode(mean_name, *mean); - auto variance_const_node = graph->AddNode(variance_name, *variance); + auto scale_node = graph->Add(scale_name, *scale); + auto bias_node = graph->Add(bias_name, *bias); + auto mean_node = graph->Add(mean_name, *mean); + auto variance_node = graph->Add(variance_name, *variance); // Batch Norm node - auto batch_norm_node = graph->AddNode(y_name); - batch_norm_node->set_input_x(*x_node); - 
batch_norm_node->set_input_scale(*scale_const_node); - batch_norm_node->set_input_offset(*bias_const_node); - batch_norm_node->set_input_mean(*mean_const_node); - batch_norm_node->set_input_variance(*variance_const_node); - batch_norm_node->set_attr_momentum(momentum); - batch_norm_node->set_attr_epsilon(epsilon); - batch_norm_node->set_attr_mode(mode); - batch_norm_node->set_attr_use_global_stats(use_global_stats); + auto batch_norm_node = graph->Add(y_name); + auto batch_norm_op = batch_norm_node->data(); + batch_norm_op->set_input_x(*x_node->data()); + batch_norm_op->set_input_scale(*scale_node->data()); + batch_norm_op->set_input_offset(*bias_node->data()); + batch_norm_op->set_input_mean(*mean_node->data()); + batch_norm_op->set_input_variance(*variance_node->data()); + batch_norm_op->set_attr_momentum(momentum); + batch_norm_op->set_attr_epsilon(epsilon); + batch_norm_op->set_attr_mode(mode); + batch_norm_op->set_attr_use_global_stats(use_global_stats); return SUCCESS; } @@ -99,6 +100,6 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - batch_norm, +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kNPU, paddle::lite::subgraph::npu::BatchNormConverter); diff --git a/lite/kernels/npu/bridges/batch_norm_op_test.cc b/lite/kernels/npu/bridges/batch_norm_op_test.cc deleted file mode 100644 index 38a876efb7c8ca6c38dee44e3c7a29a141d995d4..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/batch_norm_op_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - float momentum = op_info->GetAttr("momentum"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = 
x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void test_batch_norm( - int bs, int ic, int ih, int iw, float epsilon, float momentum) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - 
opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", momentum); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto epsilon : {1e-4f, 1e-5f}) { - for (auto momentum : {0.9f, 0.99f}) { - test_batch_norm(bs, ic, ih, iw, epsilon, momentum); - } - } - } - } - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_NPU_BRIDGE(batch_norm); diff --git a/lite/kernels/npu/bridges/concat_op.cc b/lite/kernels/npu/bridges/concat_op.cc index 44a2734c89d9fc3982dd3b934fac2d314bf600f3..e40af8703dd1dda7303f0976fa03abec7cdf7aaa 100644 --- a/lite/kernels/npu/bridges/concat_op.cc +++ b/lite/kernels/npu/bridges/concat_op.cc @@ -44,21 +44,22 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Traverse all of input nodes which are added into the new created concat // node - auto concat_node = graph->AddNode(out_name); - concat_node->set_attr_axis(axis); - concat_node->set_attr_N(num); - concat_node->create_dynamic_input_x(num); + auto concat_node = graph->Add(out_name); + auto concat_op = concat_node->data(); + concat_op->set_attr_axis(axis); + concat_op->set_attr_N(num); + concat_op->create_dynamic_input_x(num); int idx = 1; for (auto& x_name : x_names) { auto x = scope->FindMutableTensor(x_name); auto x_dims = 
x->dims(); - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } - concat_node->set_dynamic_input_x(idx, *x_node); + concat_op->set_dynamic_input_x(idx, *x_node->data()); idx++; } return SUCCESS; @@ -69,6 +70,6 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - concat, +REGISTER_SUBGRAPH_BRIDGE(concat, + kNPU, paddle::lite::subgraph::npu::ConcatConverter); diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 6b34e76880f54a3cc221a6d1e2e539214b0f79f9..60877f768b2ea691d99635944d4f1dbef7365fb3 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -67,11 +67,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } if (paddings.size() == 2L) { @@ -109,104 +109,102 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Add bias node if exists bias // Supports the bias nodes with the following dimensions // 0: {oc} // 1: {1, oc, oh, ow} // 2: {n, oc, oh, ow} - std::shared_ptr bias_node = nullptr; + std::shared_ptr bias_node = nullptr; bool is_channel_bias = false; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = 
op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); - CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto bias_dims = bias->dims(); - auto bias_data_size = bias_dims.production(); - auto output_data_size = output_dims.production(); - std::vector bias_shape; - if (bias_data_size == oc) { - // 0: {oc} - bias_shape = {1, oc, 1, 1}; - is_channel_bias = true; - } else if (bias_data_size == output_data_size / bs) { - // 1: {1, oc, oh, ow} - bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; - } else if (bias_data_size == output_data_size) { - // 2: {n, oc, oh, ow} - bias_shape = output_dims.Vectorize(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); } else { - LOG(WARNING) << "[NPU] Bias dimension " << bias_dims - << " isn't supported in conv2d Op when output dimension is " - << output_dims; - return FAILED; - } - if (graph->HasNode(bias_name)) { - // Bias node from input node - bias_node = graph->GetNode(bias_name); - } else { - // Bias node with const data - bias_node = graph->AddNode(bias_name, *bias, bias_shape); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + std::vector bias_shape; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {1, oc, 1, 1}; + is_channel_bias = true; + } else if (bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(WARNING) + << "[NPU] Bias dimension " << 
bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); } } // Conv node - std::shared_ptr conv_node = nullptr; + std::shared_ptr conv_node = nullptr; if (use_depthwise_conv && is_depthwise_mode) { - auto depthwise_conv_node = - graph->AddNode(output_name); - depthwise_conv_node->set_input_x(*input_node); - depthwise_conv_node->set_input_filter(*filter_const_node); - depthwise_conv_node->set_attr_mode(1); - depthwise_conv_node->set_attr_algo(0); - depthwise_conv_node->set_attr_format(0); // NCHW - depthwise_conv_node->set_attr_pad_mode(5); // VALID - depthwise_conv_node->set_attr_group(groups); - depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_mode(1); + conv_op->set_attr_algo(0); + conv_op->set_attr_format(0); // NCHW + conv_op->set_attr_pad_mode(5); // VALID + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[1], paddings[2], paddings[3]})); - depthwise_conv_node->set_attr_dilation( + conv_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - depthwise_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - depthwise_conv_node->set_attr_kernel( + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + conv_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - conv_node = depthwise_conv_node; // ConvolutionDepthwise Op doesn't support bias, so append Add node to // support bias if (bias_node != nullptr) { - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*depthwise_conv_node); - add_node->set_input_x2(*bias_node); + auto add_node = graph->Add(output_name); + auto add_op = 
add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); conv_node = add_node; } } else { - auto common_conv_node = graph->AddNode(output_name); - common_conv_node->set_input_x(*input_node); - common_conv_node->set_input_w(*filter_const_node); - common_conv_node->set_attr_mode(1); - common_conv_node->set_attr_pad_mode(0); // NOTSET - common_conv_node->set_attr_group(groups); - common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_w(*filter_node->data()); + conv_op->set_attr_mode(1); + conv_op->set_attr_pad_mode(0); // NOTSET + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[0], paddings[2], paddings[2]})); - common_conv_node->set_attr_dilation( + conv_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - common_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - common_conv_node->set_attr_kernel( + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + conv_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - conv_node = common_conv_node; // Convolution Op only support bias with dimension {1, oc, 1, 1}, // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) if (bias_node != nullptr) { if (is_channel_bias) { - common_conv_node->set_input_b(*bias_node); + conv_op->set_input_b(*bias_node->data()); } else { - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*common_conv_node); - add_node->set_input_x2(*bias_node); + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); conv_node = add_node; } } @@ -215,9 +213,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* 
kernel) { if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_name); - relu_node->set_input_x(*conv_node); - relu_node->set_attr_mode(CvtActMode("relu")); + auto relu_node = graph->Add(output_name); + auto relu_op = relu_node->data(); + relu_op->set_input_x(*conv_node->data()); + relu_op->set_attr_mode(CvtActMode("relu")); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -227,9 +226,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - conv2d, +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kNPU, paddle::lite::subgraph::npu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - depthwise_conv2d, +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kNPU, paddle::lite::subgraph::npu::ConvConverter); diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 5ac0723c7841ae08290a1d0dfeb1265a855f8fde..ab31a920ec045c8d12139b804abbcca94f3e009a 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -58,11 +58,11 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } // Create input sizes node to describe the dimensions of input tensor @@ -83,55 +83,59 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { (input_dims[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i]; input_sizes.push_back(output_size); } - auto input_sizes_const_node = - graph->AddNode(output_name + "/input_sizes", input_sizes); + auto input_sizes_node = 
graph->Add(output_name + "/input_sizes", input_sizes); // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Deconv node - auto conv_transpose_node = graph->AddNode(output_name); - conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - conv_transpose_node->set_input_filter(*filter_const_node); - conv_transpose_node->set_input_x(*input_node); + auto conv_transpose_node = graph->Add(output_name); + auto conv_transpose_op = conv_transpose_node->data(); + conv_transpose_op->set_input_input_sizes(*input_sizes_node->data()); + conv_transpose_op->set_input_filter(*filter_node->data()); + conv_transpose_op->set_input_x(*input_node->data()); // Set attributes - conv_transpose_node->set_attr_format(0); // NCHW - conv_transpose_node->set_attr_pad_mode(0); // NOTSET - conv_transpose_node->set_attr_group(groups); - conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_transpose_op->set_attr_format(0); // NCHW + conv_transpose_op->set_attr_pad_mode(0); // NOTSET + conv_transpose_op->set_attr_group(groups); + conv_transpose_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[1], paddings[2], paddings[3]})); - conv_transpose_node->set_attr_dilation( + conv_transpose_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - conv_transpose_node->set_attr_stride( + conv_transpose_op->set_attr_stride( ge::AttrValue::LIST_INT({strides[0], strides[1]})); - conv_transpose_node->set_attr_kernel( + conv_transpose_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); // Append add node to add bias if exists bias - std::shared_ptr output_node = conv_transpose_node; if (HasInputArg(op_info, scope, "Bias")) { - // Create bias node + std::shared_ptr bias_node = nullptr; auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); 
- CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto channel_size = bias->dims().production(); - CHECK_EQ(channel_size, filter_dims[1] * groups); - auto bias_const_node = - graph->AddNode(bias_name, *bias, {1, channel_size, 1, 1}); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto channel_size = bias->dims().production(); + CHECK_EQ(channel_size, filter_dims[1] * groups); + bias_node = graph->Add(bias_name, *bias, {1, channel_size, 1, 1}); + } // Append add node to add bias node - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*conv_transpose_node); - add_node->set_input_x2(*bias_const_node); - output_node = add_node; + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_transpose_node->data()); + add_op->set_input_x2(*bias_node->data()); + conv_transpose_node = add_node; } if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_name); - relu_node->set_input_x(*output_node); - relu_node->set_attr_mode(CvtActMode("relu")); + auto relu_node = graph->Add(output_name); + auto relu_op = relu_node->data(); + relu_op->set_input_x(*conv_transpose_node->data()); + relu_op->set_attr_mode(CvtActMode("relu")); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -141,6 +145,6 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - conv2d_transpose, +REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose, + kNPU, paddle::lite::subgraph::npu::ConvTransposeConverter); diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index 
a31a1426dc5dc8c537a05bf44287d7256be7085a..69b77b5def4faf9722fdbd8bd6f2480e67e2c160 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -74,45 +74,45 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { auto y_new_shape = CvtYShape(x_dims, y_dims, axis); - y_node = graph->AddNode(y_name, y_new_shape); + y_node = graph->Add(y_name, *y, y_new_shape); } // Elementwise node - std::shared_ptr elementwise_node = nullptr; + std::shared_ptr elt_node = nullptr; if (op_type == "elementwise_add" || op_type == "fusion_elementwise_add_activation") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else if (op_type == "elementwise_sub") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else if (op_type == "elementwise_mul") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x(*x_node); - elt_node->set_input_y(*y_node); - elementwise_node = elt_node; + 
elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x(*x_node->data()); + elt_op->set_input_y(*y_node->data()); } else if (op_type == "elementwise_div") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else { LOG(WARNING) << "[NPU] Unsupported op type: " << op_type; return FAILED; @@ -121,11 +121,12 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Act node if (op_type == "fusion_elementwise_add_activation") { auto act_type = op_info->GetAttr("act_type"); - auto act_node = graph->AddNode(out_name); - act_node->set_input_x(*elementwise_node); + auto act_node = graph->Add(out_name); + auto act_op = act_node->data(); + act_op->set_input_x(*elt_node->data()); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. 
- act_node->set_attr_mode(CvtActMode(act_type)); + act_op->set_attr_mode(CvtActMode(act_type)); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -135,18 +136,18 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_add, +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - fusion_elementwise_add_activation, +REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_sub, +REGISTER_SUBGRAPH_BRIDGE(elementwise_sub, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_mul, +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_div, +REGISTER_SUBGRAPH_BRIDGE(elementwise_div, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc old mode 100755 new mode 100644 index e7e35831dd7cc4477dfac31a72884e5e5ea19483..546a235148420e26d746ff730e22b2170e301cd6 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -57,9 +57,11 @@ int Engine::BuildOriginProgram() { VLOG(3) << "The attr '" << kKernelTypeAttr << "' not found, pick the first kernel for " << op_type; #if defined(LITE_WITH_ARM) - auto kernels = op->CreateKernels({Place{TARGET(kARM)}}); + auto kernels = + op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); #elif defined(LITE_WITH_X86) - auto kernels = op->CreateKernels({Place{TARGET(kX86)}}); + auto kernels = + op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); #endif CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type; picked_kernel = std::move(kernels.front()); diff --git 
a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/fc_op.cc b/lite/kernels/npu/bridges/fc_op.cc index 7b66d545651f5e41c65f36e743d74f0508daf8b3..3d028172154e58c1ed191b4d4eb780e9937458a5 100644 --- a/lite/kernels/npu/bridges/fc_op.cc +++ b/lite/kernels/npu/bridges/fc_op.cc @@ -57,22 +57,24 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " m: " << m << " k: " << k << " n: " << n; // Create input node and reshape it to (m, k, 1, 1) - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } auto reshaped_input_node = - graph->AddNode(input_name + "/reshape"); - reshaped_input_node->set_input_tensor(*input_node); - reshaped_input_node->set_attr_shape({m, k, 1, 1}); - reshaped_input_node->set_attr_axis(0); + graph->Add(input_name + "/reshape"); + auto reshaped_input_op = reshaped_input_node->data(); + reshaped_input_op->set_input_tensor(*input_node->data()); + reshaped_input_op->set_attr_shape({m, k, 1, 1}); + reshaped_input_op->set_attr_axis(0); // Create w const node, set its shape to (n, k, 1, 1) and fill with // the transposed w tensor Tensor transpose_w; transpose_w.Resize({n, k, 1, 1}); + transpose_w.set_persistable(true); auto transpose_w_data = transpose_w.mutable_data(); auto w_data = w->mutable_data(); for (int i = 0; i < k; i++) { @@ -80,29 +82,36 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { transpose_w_data[j * k + i] = w_data[i * n + j]; } } - auto trans_w_const_node = graph->AddNode(w_name, transpose_w); + auto trans_w_node = graph->Add(w_name, transpose_w); // FC node - auto fc_node = graph->AddNode(out_name + "/fc"); - 
fc_node->set_input_x(*reshaped_input_node); - fc_node->set_input_w(*trans_w_const_node); + auto fc_node = graph->Add(out_name + "/fc"); + auto fc_op = fc_node->data(); + fc_op->set_input_x(*reshaped_input_node->data()); + fc_op->set_input_w(*trans_w_node->data()); // Add bias node if bias tensor exists if (HasInputArg(op_info, scope, "Bias")) { + std::shared_ptr bias_node = nullptr; auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); - CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto bias_dims = bias->dims(); - CHECK_EQ(bias_dims.production(), n); - auto bias_const_node = graph->AddNode(bias_name, *bias, {1, n, 1, 1}); - fc_node->set_input_b(*bias_const_node); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + CHECK_EQ(bias_dims.production(), n); + bias_node = graph->Add(bias_name, *bias, {1, n, 1, 1}); + } + fc_op->set_input_b(*bias_node->data()); } // Reshape output of FC node from (m, n, 1, 1) to (m, n) - auto reshaped_fc_node = graph->AddNode(out_name); - reshaped_fc_node->set_input_tensor(*fc_node); - reshaped_fc_node->set_attr_shape({m, n}); - reshaped_fc_node->set_attr_axis(0); + auto reshaped_fc_node = graph->Add(out_name); + auto reshaped_fc_op = reshaped_fc_node->data(); + reshaped_fc_op->set_input_tensor(*fc_node->data()); + reshaped_fc_op->set_attr_shape({m, n}); + reshaped_fc_op->set_attr_axis(0); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -111,4 +120,4 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, fc, 
paddle::lite::subgraph::npu::FCConverter); +REGISTER_SUBGRAPH_BRIDGE(fc, kNPU, paddle::lite::subgraph::npu::FCConverter); diff --git a/lite/kernels/npu/bridges/graph.cc b/lite/kernels/npu/bridges/graph.cc old mode 100755 new mode 100644 index 48ebfd567320f9b55d104a350a1ba35176fd47b4..7d3afd92bfb645b4914346be155abe80026e42ef --- a/lite/kernels/npu/bridges/graph.cc +++ b/lite/kernels/npu/bridges/graph.cc @@ -21,26 +21,52 @@ namespace lite { namespace subgraph { namespace npu { -// Const node -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - auto node = AddNode(name, precision, layout); - node->set_attr_value(CvtTensor(tensor, shape, precision, layout)); +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[NPU] Const or data node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = Add(name, precision, layout); + node->data()->set_attr_value( + CvtTensor(tensor, shape, layout)); + } else { + // Data node + node = Add(name, shape, precision, layout); + } return node; } // Data node -std::shared_ptr Graph::AddNode(const std::string& name, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - auto node = AddNode(name); +std::shared_ptr Graph::Add(const std::string& name, + std::vector 
shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = Add(name, precision, layout); ge::TensorDesc desc( ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); - node->update_input_desc_x(desc); + node->data()->update_input_desc_x(desc); return node; } diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h old mode 100755 new mode 100644 index 9b6e49c5e9e12ae6fc5ba38192ccfe487fcc0c07..cc4a7e2a7ce062090ca890d90e21aa643e37a0d3 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -19,7 +19,7 @@ #include #include #include -#include "ai_ddk_lib/include/graph/op/all_ops.h" +#include "graph/op/all_ops.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" @@ -28,105 +28,94 @@ namespace lite { namespace subgraph { namespace npu { -// Type of graph nodes -class Type { +// Graph and node is defined to collect all of converted HiAI IR nodes +class Node { public: - Type(PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - bool persistable = false) - : precision_(precision), layout_(layout), persistable_(persistable) {} - + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } void set_precision(PrecisionType precision) { precision_ = precision; } void set_layout(DataLayoutType layout) { layout_ = layout; } - bool set_persistable(bool persistable) { persistable_ = persistable; } + void set_role(Role role) { role_ = role; } + template + std::shared_ptr data() { + return std::static_pointer_cast(data_); + } + std::shared_ptr data() { return data_; } PrecisionType precision() const { return 
precision_; } DataLayoutType layout() const { return layout_; } - bool persistable() const { return persistable_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } private: + std::shared_ptr data_{nullptr}; PrecisionType precision_{PRECISION(kFloat)}; DataLayoutType layout_{DATALAYOUT(kNCHW)}; - bool persistable_{false}; + Role role_{Role::kVar}; }; -// Graph to collect all of converted HiAI IR nodes class Graph { public: + int Add(const std::string& name, std::shared_ptr node); + + // Variable, const or data node template - std::shared_ptr AddNode(const std::string& name, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - auto unique_name = [&](const std::string& key) { - int idx = 1; - auto it = counts_.find(key); - if (it == counts_.end()) { - counts_.insert(std::make_pair(key, idx)); - } else { - idx = ++(it->second); - } - return key + "_" + std::to_string(idx); - }; - bool persistable = typeid(T) == typeid(ge::op::Const); - auto it = nodes_.find(name); - if (it != nodes_.end()) { - // Only variable can rebind the name - CHECK(!it->second.second.persistable() && !persistable) - << "[NPU] Node " << name << " redefined."; - // Generate a new unique name as the key to bind the origin node: - // new_name->node - nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); - nodes_.erase(it); + std::shared_ptr Add(const std::string& name, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + Node::Role role = Node::Role::kVar; + if (typeid(T) == typeid(ge::op::Const)) { + role = Node::Role::kConst; + } else if (typeid(T) == typeid(ge::op::Data)) { + role = Node::Role::kData; } - // Create a new node and bind with the name: name->new_node - auto node = std::make_shared(unique_name(name + "_op")); - nodes_.insert(std::make_pair( - name, 
std::make_pair(node, Type(precision, layout, persistable)))); + auto node = std::make_shared(precision, layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + // Generate a unique name for the created HiAI IR + node->set_data(std::make_shared(name + "__" + std::to_string(idx))); return node; } - // Const node - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); } - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, dims.Vectorize(), precision, layout); + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); } + // Const node template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - std::vector shape = {}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - const std::type_info& info = typeid(T); - PrecisionType precision = PRECISION(kFloat); - if (info == typeid(float)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int8_t)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int32_t)) { 
- precision = PRECISION(kInt32); - } else { - LOG(FATAL) << "[NPU] Unknow data type " << info.name(); - } + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { if (shape.empty()) { shape = {static_cast(data.size())}; } else { @@ -138,78 +127,66 @@ class Graph { } Tensor tensor; tensor.Resize(shape); + tensor.set_persistable(true); std::memcpy(reinterpret_cast(tensor.mutable_data()), reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, precision, layout); + return Add(name, tensor, layout); } template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, data, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, data, dims.Vectorize(), layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - std::vector shape = {1}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, layout); + return Add(name, data, shape, layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, value, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); } // Data node - std::shared_ptr AddNode( - const std::string& name, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = 
DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, dims.Vectorize(), precision, layout); - } - - std::shared_ptr GetNode(std::string name) { - CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; - return nodes_.at(name).first; + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); } - const Type& GetType(const std::string& name) { - CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; - return nodes_.at(name).second; + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[NPU] Node " << name << " not found."; + return nodes_.at(name).back(); } - bool HasNode(const std::string& name) { + bool Has(const std::string& name) { return nodes_.find(name) != nodes_.end(); } private: - std::unordered_map, Type>> - nodes_; - std::unordered_map counts_; + std::unordered_map>> nodes_; }; } // namespace npu diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index f95ebc347a86051aba23e0d61799ab2efb5c5567..238200abf3d37c8967a371f6ada4d6b6d6223b4d 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -55,11 +55,11 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { "supported in HiAI DDK"; // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = 
graph->Add(x_name, *x); } // Priority: OutSize > scale > out_h/out_w @@ -71,17 +71,18 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Update out_h and out_w and create out_size node if has OutSize - std::shared_ptr out_size_node = nullptr; + std::shared_ptr out_size_node = nullptr; if (HasInputArg(op_info, scope, "OutSize")) { auto out_size_name = op_info->Input("OutSize").front(); auto out_size_type = kernel->GetInputDeclType("OutSize"); CHECK(out_size_type->precision() == PRECISION(kInt32)); CHECK(out_size_type->layout() == DATALAYOUT(kNCHW)); - if (graph->HasNode(out_size_name)) { - out_size_node = graph->GetNode(out_size_name); + if (graph->Has(out_size_name)) { + out_size_node = graph->Get(out_size_name); } else { auto out_size = scope->FindMutableTensor(out_size_name); CHECK_EQ(out_size->numel(), 2); + CHECK(out_size->persistable()); auto out_size_data = out_size->mutable_data(); // Update out_h and out_w if has OutSize out_h = out_size_data[0]; @@ -97,22 +98,25 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; } - out_size_node = graph->AddNode(out_name + "/out_size", - std::vector({out_h, out_w})); + out_size_node = + graph->Add(out_name + "/out_size", std::vector({out_h, out_w})); } if (interp_method == "bilinear") { - auto bilinear_interp_node = - graph->AddNode(out_name); - bilinear_interp_node->set_input_x(*x_node); - bilinear_interp_node->set_input_size(*out_size_node); - bilinear_interp_node->set_attr_align_corners(align_corners); + auto bilinear_interp_node = graph->Add(out_name); + auto bilinear_interp_op = + bilinear_interp_node->data(); + bilinear_interp_op->set_input_x(*x_node->data()); + bilinear_interp_op->set_input_size(*out_size_node->data()); + bilinear_interp_op->set_attr_align_corners(align_corners); } else if (interp_method == "nearest") { auto nearest_interp_node = - graph->AddNode(out_name); - 
nearest_interp_node->set_input_image(*x_node); - nearest_interp_node->set_input_size(*out_size_node); - nearest_interp_node->set_attr_align_corners(align_corners); + graph->Add(out_name); + auto nearest_interp_op = + nearest_interp_node->data(); + nearest_interp_op->set_input_image(*x_node->data()); + nearest_interp_op->set_input_size(*out_size_node->data()); + nearest_interp_op->set_attr_align_corners(align_corners); } else { LOG(WARNING) << "[NPU] Unsupported interpolate method: " << interp_method; return FAILED; @@ -125,9 +129,9 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - bilinear_interp, +REGISTER_SUBGRAPH_BRIDGE(bilinear_interp, + kNPU, paddle::lite::subgraph::npu::InterpolateConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - nearest_interp, +REGISTER_SUBGRAPH_BRIDGE(nearest_interp, + kNPU, paddle::lite::subgraph::npu::InterpolateConverter); diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index f63b6826b98cdf5f2f8207376d367ee5f89e0c51..27df45819537faed291e108cc8a78a9a9de202cf 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -56,45 +56,46 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { << "[NPU] columns of X must be equal with rows of Y"; int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); VLOG(3) << "m:" << m << ",n:" << n << ",k:" << k; - VLOG(3) << "x_name:" << x_name << ", is data: " << graph->HasNode(x_name); - VLOG(3) << "y_name:" << y_name << ", is data: " << graph->HasNode(y_name); - CHECK(graph->HasNode(x_name)) + VLOG(3) << "x_name:" << x_name << ", is data: " << graph->Has(x_name); + VLOG(3) << "y_name:" << y_name << ", is data: " << graph->Has(y_name); + CHECK(graph->Has(x_name)) << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; // X node which supports persistable and non-persistable tensor, and // reshape to (m, k) - 
std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); - auto reshaped_x_node = graph->AddNode(x_name + "/reshape"); - reshaped_x_node->set_input_tensor(*x_node); - reshaped_x_node->set_attr_shape({m, k}); - reshaped_x_node->set_attr_axis(0); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + auto reshaped_x_node = graph->Add(x_name + "/reshape"); + auto reshaped_x_op = reshaped_x_node->data(); + reshaped_x_op->set_input_tensor(*x_node->data()); + reshaped_x_op->set_attr_shape({m, k}); + reshaped_x_op->set_attr_axis(0); x_node = reshaped_x_node; } else { - auto x_const_node = graph->AddNode(x_name, *x, {m, k}); - x_node = x_const_node; + x_node = graph->Add(x_name, *x, {m, k}); } // Y node which only supports persistable tensor, and reshape to // (k,n) - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); - auto reshaped_y_node = graph->AddNode(y_name + "/reshape"); - reshaped_y_node->set_input_tensor(*y_node); - reshaped_y_node->set_attr_shape({k, n}); - reshaped_y_node->set_attr_axis(0); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); + auto reshaped_y_node = graph->Add(y_name + "/reshape"); + auto reshaped_y_op = reshaped_y_node->data(); + reshaped_y_op->set_input_tensor(*y_node->data()); + reshaped_y_op->set_attr_shape({k, n}); + reshaped_y_op->set_attr_axis(0); y_node = reshaped_y_node; } else { - auto y_const_node = graph->AddNode(y_name, *y, {k, n}); - y_node = y_const_node; + y_node = graph->Add(y_name, *y, {k, n}); } // Matmul node - auto mul_node = graph->AddNode(out_name); - mul_node->set_input_x1(*x_node); - mul_node->set_input_x2(*y_node); + auto mul_node = graph->Add(out_name); + auto mul_op = mul_node->data(); + mul_op->set_input_x1(*x_node->data()); + mul_op->set_input_x2(*y_node->data()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -103,4 +104,4 @@ int 
MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, mul, paddle::lite::subgraph::npu::MulConverter); +REGISTER_SUBGRAPH_BRIDGE(mul, kNPU, paddle::lite::subgraph::npu::MulConverter); diff --git a/lite/kernels/npu/bridges/pad2d_op.cc b/lite/kernels/npu/bridges/pad2d_op.cc index 451f48b1df6c7fedf0505ad5c6165f2d43514966..e6852da78776808dfa7d0f9a75b1b2fe077190b6 100644 --- a/lite/kernels/npu/bridges/pad2d_op.cc +++ b/lite/kernels/npu/bridges/pad2d_op.cc @@ -45,35 +45,34 @@ int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(padding.size(), 4); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Padding node int xds = x_dims.size(); padding.insert(padding.begin(), xds * 2 - 4, 0); - auto padding_const_node = - graph->AddNode(out_name + "/padding", padding, {xds, 2}); + auto padding_node = graph->Add(out_name + "/padding", padding, {xds, 2}); // Pad node - auto pad2d_node = graph->AddNode(out_name); - pad2d_node->set_input_x(*x_node); - pad2d_node->set_input_padding(*padding_const_node); + auto pad2d_node = graph->Add(out_name); + auto pad2d_op = pad2d_node->data(); + pad2d_op->set_input_x(*x_node->data()); + pad2d_op->set_input_padding(*padding_node->data()); auto mode = op_info->GetAttr("mode"); if (mode == "constant") { // Pad value node auto pad_value = op_info->GetAttr("pad_value"); - auto pad_value_const_node = - graph->AddNode(out_name + "/pad_value", pad_value); - pad2d_node->set_input_constant_values(*pad_value_const_node); - pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 - pad2d_node->set_attr_mode(0); + auto pad_value_node = graph->Add(out_name + "/pad_value", pad_value); + 
pad2d_op->set_input_constant_values(*pad_value_node->data()); + pad2d_op->set_attr_T(0); // type of pad_value: 0:float 3:int32 + pad2d_op->set_attr_mode(0); } else if (mode == "reflect") { LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; - pad2d_node->set_attr_mode(1); + pad2d_op->set_attr_mode(1); return FAILED; } else { LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; @@ -87,6 +86,6 @@ int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - pad2d, +REGISTER_SUBGRAPH_BRIDGE(pad2d, + kNPU, paddle::lite::subgraph::npu::Pad2dConverter); diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h old mode 100755 new mode 100644 index d6fc5353385fd86b9ae682769684720f4e0ad57c..a63a0d889d4792bf95e9749df4f4772e3d667d5f --- a/lite/kernels/npu/bridges/paddle_use_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_bridges.h @@ -14,40 +14,42 @@ #pragma once -USE_SUBGRAPH_BRIDGE(NPU, sigmoid); -USE_SUBGRAPH_BRIDGE(NPU, relu); -USE_SUBGRAPH_BRIDGE(NPU, tanh); -USE_SUBGRAPH_BRIDGE(NPU, relu_clipped); -USE_SUBGRAPH_BRIDGE(NPU, leaky_relu); -USE_SUBGRAPH_BRIDGE(NPU, softsign); -USE_SUBGRAPH_BRIDGE(NPU, hard_sigmoid); +USE_SUBGRAPH_BRIDGE(sigmoid, kNPU); +USE_SUBGRAPH_BRIDGE(relu, kNPU); +USE_SUBGRAPH_BRIDGE(tanh, kNPU); +USE_SUBGRAPH_BRIDGE(relu_clipped, kNPU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kNPU); +USE_SUBGRAPH_BRIDGE(softsign, kNPU); +USE_SUBGRAPH_BRIDGE(hard_sigmoid, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, batch_norm); -USE_SUBGRAPH_BRIDGE(NPU, concat); -USE_SUBGRAPH_BRIDGE(NPU, conv2d); -USE_SUBGRAPH_BRIDGE(NPU, depthwise_conv2d); -USE_SUBGRAPH_BRIDGE(NPU, conv2d_transpose); +USE_SUBGRAPH_BRIDGE(batch_norm, kNPU); +USE_SUBGRAPH_BRIDGE(concat, kNPU); +USE_SUBGRAPH_BRIDGE(conv2d, kNPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kNPU); +USE_SUBGRAPH_BRIDGE(conv2d_transpose, kNPU); 
-USE_SUBGRAPH_BRIDGE(NPU, elementwise_add); -USE_SUBGRAPH_BRIDGE(NPU, fusion_elementwise_add_activation); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_sub); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_mul); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_div); +USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU); +USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, fc); -USE_SUBGRAPH_BRIDGE(NPU, bilinear_interp); -USE_SUBGRAPH_BRIDGE(NPU, nearest_interp); -USE_SUBGRAPH_BRIDGE(NPU, mul); -USE_SUBGRAPH_BRIDGE(NPU, pad2d); -USE_SUBGRAPH_BRIDGE(NPU, pool2d); -USE_SUBGRAPH_BRIDGE(NPU, reduce_mean); -USE_SUBGRAPH_BRIDGE(NPU, reshape); -USE_SUBGRAPH_BRIDGE(NPU, reshape2); -USE_SUBGRAPH_BRIDGE(NPU, scale); -USE_SUBGRAPH_BRIDGE(NPU, shuffle_channel); -USE_SUBGRAPH_BRIDGE(NPU, softmax); -USE_SUBGRAPH_BRIDGE(NPU, split); -USE_SUBGRAPH_BRIDGE(NPU, sqrt); -USE_SUBGRAPH_BRIDGE(NPU, square); -USE_SUBGRAPH_BRIDGE(NPU, transpose); -USE_SUBGRAPH_BRIDGE(NPU, transpose2); +USE_SUBGRAPH_BRIDGE(fc, kNPU); +USE_SUBGRAPH_BRIDGE(bilinear_interp, kNPU); +USE_SUBGRAPH_BRIDGE(nearest_interp, kNPU); +USE_SUBGRAPH_BRIDGE(mul, kNPU); +USE_SUBGRAPH_BRIDGE(pad2d, kNPU); +USE_SUBGRAPH_BRIDGE(pool2d, kNPU); +USE_SUBGRAPH_BRIDGE(reduce_mean, kNPU); +USE_SUBGRAPH_BRIDGE(reshape, kNPU); +USE_SUBGRAPH_BRIDGE(reshape2, kNPU); +USE_SUBGRAPH_BRIDGE(scale, kNPU); +USE_SUBGRAPH_BRIDGE(shuffle_channel, kNPU); +USE_SUBGRAPH_BRIDGE(softmax, kNPU); +USE_SUBGRAPH_BRIDGE(split, kNPU); +USE_SUBGRAPH_BRIDGE(sqrt, kNPU); +USE_SUBGRAPH_BRIDGE(square, kNPU); +USE_SUBGRAPH_BRIDGE(transpose, kNPU); +USE_SUBGRAPH_BRIDGE(transpose2, kNPU); +USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU); +USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU); diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h deleted file mode 100644 index 
9a432d17e543bece48fb1c1369ee90ff56e8dcbf..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/kernels/npu/bridges/registry.h" - -USE_NPU_BRIDGE(sigmoid); -USE_NPU_BRIDGE(relu); -USE_NPU_BRIDGE(tanh); -USE_NPU_BRIDGE(relu_clipped); -USE_NPU_BRIDGE(leaky_relu); -USE_NPU_BRIDGE(softsign); -USE_NPU_BRIDGE(hard_sigmoid); - -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(concat); -USE_NPU_BRIDGE(conv2d); -USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(conv2d_transpose); - -USE_NPU_BRIDGE(elementwise_add); -USE_NPU_BRIDGE(fusion_elementwise_add_activation); -USE_NPU_BRIDGE(elementwise_sub); -USE_NPU_BRIDGE(elementwise_mul); -USE_NPU_BRIDGE(elementwise_div); - -USE_NPU_BRIDGE(fc); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(nearest_interp); -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(pad2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(reduce_mean); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); -USE_NPU_BRIDGE(scale); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(split); -USE_NPU_BRIDGE(sqrt); -USE_NPU_BRIDGE(square); -USE_NPU_BRIDGE(transpose); -USE_NPU_BRIDGE(transpose2); diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 
8b108fc4ee07308c95a3a5d53173d14c8fa457d5..42349d18398b2f95003c859b15a32b707f97742a 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -48,11 +48,11 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto paddings = op_info->GetAttr>("paddings"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // pool mode @@ -109,19 +109,19 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Pooling node - auto pool_node = graph->AddNode(out_name); - pool_node->set_input_x(*x_node); - pool_node->set_attr_mode(mode); - pool_node->set_attr_pad_mode(pad_mode); - pool_node->set_attr_global_pooling(global_pooling); - pool_node->set_attr_window( - ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); - pool_node->set_attr_pad(ge::AttrValue::LIST_INT{ + auto pool_node = graph->Add(out_name); + auto pool_op = pool_node->data(); + pool_op->set_input_x(*x_node->data()); + pool_op->set_attr_mode(mode); + pool_op->set_attr_pad_mode(pad_mode); + pool_op->set_attr_global_pooling(global_pooling); + pool_op->set_attr_window(ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); + pool_op->set_attr_pad(ge::AttrValue::LIST_INT{ paddings[0], paddings[1], paddings[2], paddings[3]}); - pool_node->set_attr_stride( + pool_op->set_attr_stride( ge::AttrValue::LIST_INT(strides.begin(), strides.end())); - pool_node->set_attr_ceil_mode(ceil_mode); - // pool_node->set_attr_data_mode(data_mode); + pool_op->set_attr_ceil_mode(ceil_mode); + // pool_op->set_attr_data_mode(data_mode); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -130,6 +130,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - pool2d, 
+REGISTER_SUBGRAPH_BRIDGE(pool2d, + kNPU, paddle::lite::subgraph::npu::PoolConverter); diff --git a/lite/kernels/npu/bridges/pool_op_test.cc b/lite/kernels/npu/bridges/pool_op_test.cc deleted file mode 100644 index 298e06554776e0f9efeade540d6498d1f71f8a16..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/pool_op_test.cc +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/pool_op.h" -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -void pool_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto& in_dims = x->dims(); - auto& out_dims = out->dims(); - - const float* src_ptr = x->data(); - float* dst_ptr = out->mutable_data(); - - std::vector ksize = op_info->GetAttr>("ksize"); - std::vector strides = op_info->GetAttr>("strides"); - std::vector paddings = op_info->GetAttr>("paddings"); - bool exclusive = op_info->GetAttr("exclusive"); - std::string pooling_type = op_info->GetAttr("pooling_type"); - bool global_pooling = op_info->GetAttr("global_pooling"); - - int in_n = in_dims[0]; - int in_c = in_dims[1]; - int in_h = in_dims[2]; - int in_w = in_dims[3]; - int size_in_n = in_c * in_h * in_w; - int size_in_c = in_h * in_w; - - int out_h = out_dims[2]; - int out_w = out_dims[3]; - int size_out_n = in_c * out_h * out_w; - int size_out_c = out_h * out_w; - - int window_h = ksize[0]; - int window_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - - if (global_pooling == true) { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - const float* src = src_ptr + n * size_in_n + c * size_in_c; - float res = src[0]; - if (pooling_type == "max") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res = cur_val > res ? 
cur_val : res; - } - } else if (pooling_type == "avg") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res += cur_val; - } - res /= size_in_c; - } - dst_ptr[n * size_out_n + c] = res; - } - } - } else { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - for (int h = 0; h < out_h; ++h) { - int sh = h * stride_h; - int eh = sh + window_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; - for (int w = 0; w < out_w; ++w) { - int sw = w * stride_w; - int ew = sw + window_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; - int pooling_size = (ew - sw) * (eh - sh); - if (pooling_size == 0) continue; - float res = 0.f; - for (int kh = sh; kh < eh; ++kh) { - for (int kw = sw; kw < ew; ++kw) { - int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; - if (kh == sh && kw == sw) { - res = src_ptr[src_idx]; - } else { - if (pooling_type == "max") { - res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; - } - if (pooling_type == "avg") { - res += src_ptr[src_idx]; - } - } - } - } - if (pooling_type == "avg") { - if (exclusive) { - res /= pooling_size; - } else { - res /= window_h * window_w; - } - } - dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; - } - } - } - } - } -} - -void test_pool(int bs, - int ic, - int ih, - int iw, - std::string pooling_type, - bool ceil_mode, - bool global_pooling, - bool exclusive, - int ksize, - int stride, - int padding) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pool2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("pooling_type", pooling_type); - opdesc.SetAttr("ksize", std::vector({ksize, ksize})); - opdesc.SetAttr("global_pooling", global_pooling); - opdesc.SetAttr("exclusive", exclusive); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", - std::vector({padding, padding, padding, padding})); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - pool_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, pool) { - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, 
false}) { - for (auto global_pooling : {/*true, */ false}) { - for (auto exclusive : {true /*, false*/}) { - for (auto ksize : {2, 3}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1}) { - for (auto bs : {1, 3}) { - for (auto ic : {1, 3}) { - for (auto ih : {3, 7}) { - for (auto iw : {3, 7}) { - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } - } - } - } - } - } - } - } - } - } - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - bool global_pooling = true; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pool2d); -USE_NPU_BRIDGE(pool2d); diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc index 6c7f29fb271956937a2c71ce79f07a4931fb9a5f..29f065675c742978638fbbb68c71dd451ca35f37 100644 --- a/lite/kernels/npu/bridges/reduce_mean_op.cc +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -52,29 +52,30 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::sort(dim.begin(), dim.end()); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Using ReduceSum + Scale to implement ReduceMean // Dim node - auto dim_const_node = graph->AddNode(out_name + "/dim", dim); + auto dim_node = graph->Add(out_name + "/dim", dim); // Reduce Sum node - auto reduce_sum_node = - graph->AddNode(out_name + 
"/reducesum"); - reduce_sum_node->set_input_x(*x_node); - reduce_sum_node->set_input_w(*dim_const_node); - reduce_sum_node->set_attr_keep_dims(keep_dim); + auto reduce_sum_node = graph->Add(out_name + "/reducesum"); + auto reduce_sum_op = reduce_sum_node->data(); + reduce_sum_op->set_input_x(*x_node->data()); + reduce_sum_op->set_input_w(*dim_node->data()); + reduce_sum_op->set_attr_keep_dims(keep_dim); // Scale node - auto scale_node = graph->AddNode(out_name); - scale_node->set_input_x(*reduce_sum_node); - scale_node->set_attr_axis(1); + auto scale_node = graph->Add(out_name); + auto scale_op = scale_node->data(); + scale_op->set_input_x(*reduce_sum_node->data()); + scale_op->set_attr_axis(1); // Add filter node(fill with scale) float scale = 1; @@ -95,9 +96,8 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), scale_bias_shape.end()); } - auto filter_const_node = - graph->AddNode(out_name + "/filter", scale, scale_bias_shape); - scale_node->set_input_filter(*filter_const_node); + auto filter_node = graph->Add(out_name + "/filter", scale, scale_bias_shape); + scale_op->set_input_filter(*filter_node->data()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -106,6 +106,6 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - reduce_mean, +REGISTER_SUBGRAPH_BRIDGE(reduce_mean, + kNPU, paddle::lite::subgraph::npu::ReduceMeanConverter); diff --git a/lite/kernels/npu/bridges/registry.cc b/lite/kernels/npu/bridges/registry.cc index 5f89bcb313ded16c207b305e3265a0a60cd81ecb..39181ccee97372f9952acea1e0523d4100e0843b 100644 --- a/lite/kernels/npu/bridges/registry.cc +++ b/lite/kernels/npu/bridges/registry.cc @@ -24,27 +24,27 @@ Registry& Registry::Instance() { return x; } -void Registry::Insert(const std::string& dev_type, - const std::string& op_type, +void Registry::Insert(const std::string& op_type, + 
const std::string& target, const cvt_func_type& cvt_func_name) { - auto it = map_.find(dev_type); + auto it = map_.find(target); if (it == map_.end()) { map_.insert(std::make_pair( - dev_type, std::unordered_map())); + target, std::unordered_map())); } - map_.at(dev_type).insert(std::make_pair(op_type, cvt_func_name)); + map_.at(target).insert(std::make_pair(op_type, cvt_func_name)); } -const cvt_func_type& Registry::Select(const std::string& dev_type, - const std::string& op_type) const { - return map_.at(dev_type).at(op_type); +const cvt_func_type& Registry::Select(const std::string& op_type, + const std::string& target) const { + return map_.at(target).at(op_type); } -bool Registry::Exists(const std::string& dev_type, - const std::string& op_type) const { - bool found = map_.find(dev_type) != map_.end(); +bool Registry::Exists(const std::string& op_type, + const std::string& target) const { + bool found = map_.find(target) != map_.end(); if (found) { - found = map_.at(dev_type).find(op_type) != map_.at(dev_type).end(); + found = map_.at(target).find(op_type) != map_.at(target).end(); } return found; } diff --git a/lite/kernels/npu/bridges/registry.h b/lite/kernels/npu/bridges/registry.h index 5198a3f8f2c0a684f15c8d7a27e5ecb3902ed43a..615a1068645e86df1fc38bdedc81ee73aad6e795 100644 --- a/lite/kernels/npu/bridges/registry.h +++ b/lite/kernels/npu/bridges/registry.h @@ -42,12 +42,12 @@ class Registry { public: static Registry& Instance(); - void Insert(const std::string& dev_type, - const std::string& op_type, + void Insert(const std::string& op_type, + const std::string& target, const cvt_func_type& cvt_func_name); - const cvt_func_type& Select(const std::string& dev_type, - const std::string& op_type) const; - bool Exists(const std::string& dev_type, const std::string& op_type) const; + const cvt_func_type& Select(const std::string& op_type, + const std::string& target) const; + bool Exists(const std::string& op_type, const std::string& target) const; Registry() 
= default; private: @@ -67,24 +67,24 @@ class Registry { #define UNUSED __attribute__((unused)) #endif -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) -#define REGISTER_SUBGRAPH_BRIDGE(dev_type, op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_subgraph_bridge_##dev_type##_##op_type##__, \ +#define REGISTER_SUBGRAPH_BRIDGE(op_type__, target__, cvt_func_name) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_subgraph_bridge_##op_type__##_##target__##__, \ "REGISTER_SUBGRAPH_BRIDGE must be called in global namespace only " \ "once!"); \ - int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert() { \ + int __reg_subgraph_bridge_##op_type__##_##target__##_Insert() { \ paddle::lite::subgraph::Registry::Instance().Insert( \ - #dev_type, #op_type, cvt_func_name); \ + #op_type__, #target__, cvt_func_name); \ return 0; \ } -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) \ - extern int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert(); \ - static int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert_return \ - UNUSED = __reg_subgraph_bridge_##dev_type##_##op_type##_Insert(); +#define USE_SUBGRAPH_BRIDGE(op_type__, target__) \ + extern int __reg_subgraph_bridge_##op_type__##_##target__##_Insert(); \ + static int __reg_subgraph_bridge_##op_type__##_##target__##_Insert_return \ + UNUSED = __reg_subgraph_bridge_##op_type__##_##target__##_Insert(); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index d5100dee4a415e28d5479fed9100ea2afc69ef02..50c7f9d65a94658f8670ac63e658656b982f4649 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -34,26 +34,25 @@ int 
ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); - CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); - CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Reshape node - auto reshape_node = graph->AddNode(out_name); - reshape_node->set_input_tensor(*x_node); + auto reshape_node = graph->Add( + out_name, x_node->precision(), x_node->layout()); + auto reshape_op = reshape_node->data(); + reshape_op->set_input_tensor(*x_node->data()); // Read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) if (HasInputArg(op_info, scope, "ShapeTensor")) { @@ -64,9 +63,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // auto actual_shape_type = kernel->GetInputDeclType("Shape"); // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); - std::shared_ptr actual_shape_node = nullptr; - if (graph->HasNode(actual_shape_name)) { - actual_shape_node = graph->GetNode(actual_shape_name); + std::shared_ptr actual_shape_node = nullptr; + if (graph->Has(actual_shape_name)) { + actual_shape_node = graph->Get(actual_shape_name); } else { auto actual_shape = scope->FindMutableTensor(actual_shape_name); auto actual_shape_dims = actual_shape->dims(); @@ -80,13 +79,13 @@ int ReshapeConverter(void* 
ctx, OpLite* op, KernelBase* kernel) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but Shape has " << out_shape.size(); + return FAILED; } - auto actual_shape_const_node = - graph->AddNode(actual_shape_name, - std::vector(out_shape.begin(), out_shape.end())); - actual_shape_node = actual_shape_const_node; + actual_shape_node = + graph->Add(actual_shape_name, + std::vector(out_shape.begin(), out_shape.end())); } - reshape_node->set_input_w(*actual_shape_node); + reshape_op->set_input_w(*actual_shape_node->data()); } else { auto shape = op_info->GetAttr>("shape"); auto out_dims = lite::operators::ValidateShape(shape, x_dims); @@ -95,33 +94,12 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but shape has " << out_shape.size(); + return FAILED; } - reshape_node->set_attr_shape( + reshape_op->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); } - // XShape node - if (op_type == "reshape2") { - // Append an extra reshape node to calc XShape - std::vector xshape_dims(x_dims.size() + 1, 1); - for (size_t i = 0; i < x_dims.size(); i++) { - xshape_dims[i + 1] = x_dims[i]; - } - if (xshape_dims.size() > 4) { - LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " - "but XShape has " - << xshape_dims.size(); - return FAILED; - } - auto xshape_name = op_info->Output("XShape").front(); - // auto xshape_type = kernel->GetOutputDeclType("XShape"); - // CHECK(xshape_type->precision() == PRECISION(kFloat)); - // CHECK(xshape_type->layout() == DATALAYOUT(kNCHW)); - auto xshape_node = graph->AddNode(xshape_name); - xshape_node->set_input_tensor(*x_node); - xshape_node->set_attr_shape( - ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end())); - } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -130,9 +108,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle 
-REGISTER_SUBGRAPH_BRIDGE(NPU, - reshape, +REGISTER_SUBGRAPH_BRIDGE(reshape, + kNPU, paddle::lite::subgraph::npu::ReshapeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - reshape2, +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kNPU, paddle::lite::subgraph::npu::ReshapeConverter); diff --git a/lite/kernels/npu/bridges/scale_op.cc b/lite/kernels/npu/bridges/scale_op.cc index ca04996faf82e875bbacc0119f431d48d4e1f657..d0139a9e2fd580f3143e9ad9809ed924e6e949a4 100644 --- a/lite/kernels/npu/bridges/scale_op.cc +++ b/lite/kernels/npu/bridges/scale_op.cc @@ -37,12 +37,15 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - CHECK_GE(x_dims.size(), 2); + auto x_rank = x_dims.size(); + CHECK_GE(x_rank, 2); auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); - std::vector scale_bias_shape = {x_dims[1]}; + // HiAI only support [n, c, 1, 1] for the shape of scale and bias + std::vector scale_bias_shape = { + 1, x_rank < 3 ? 
1 : x_dims[x_rank - 3], 1, 1}; float scale = op_info->GetAttr("scale"); float bias = op_info->GetAttr("bias"); bool bias_after_scale = op_info->GetAttr("bias_after_scale"); @@ -51,29 +54,28 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x, CvtShape(x_dims)); } // Scale node - auto scale_node = graph->AddNode(out_name); - scale_node->set_input_x(*x_node); - scale_node->set_attr_axis(1); + auto scale_node = graph->Add(out_name); + auto scale_op = scale_node->data(); + scale_op->set_input_x(*x_node->data()); + scale_op->set_attr_axis(1); // Add filter node(fill with scale) - auto filter_const_node = - graph->AddNode(out_name + "/filter", scale, scale_bias_shape); - scale_node->set_input_filter(*filter_const_node); + auto filter_node = graph->Add(out_name + "/filter", scale, scale_bias_shape); + scale_op->set_input_filter(*filter_node->data()); // Add bias node(fill with bias) if (fabs(bias) > 1e-6f) { - auto bias_const_node = - graph->AddNode(out_name + "/bias", bias, scale_bias_shape); - scale_node->set_input_bias(*bias_const_node); - scale_node->set_attr_has_bias_value(true); + auto bias_node = graph->Add(out_name + "/bias", bias, scale_bias_shape); + scale_op->set_input_bias(*bias_node->data()); + scale_op->set_attr_has_bias_value(true); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -83,6 +85,6 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - scale, +REGISTER_SUBGRAPH_BRIDGE(scale, + kNPU, paddle::lite::subgraph::npu::ScaleConverter); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op.cc b/lite/kernels/npu/bridges/shuffle_channel_op.cc index 
47469e1506d2d49d8db2ac08c38e7b66762666a0..0552bd2382041bde155b661abc053e8680dbcd3e 100644 --- a/lite/kernels/npu/bridges/shuffle_channel_op.cc +++ b/lite/kernels/npu/bridges/shuffle_channel_op.cc @@ -44,17 +44,19 @@ int ShuffleChannelConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto group = op_info->GetAttr("group"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Shuffle Channel node - auto shuffle_channel_node = graph->AddNode(out_name); - shuffle_channel_node->set_input_x(*x_node); - shuffle_channel_node->set_attr_group(group); + auto shuffle_channel_node = graph->Add(out_name); + auto shuffle_channel_op = + shuffle_channel_node->data(); + shuffle_channel_op->set_input_x(*x_node->data()); + shuffle_channel_op->set_attr_group(group); return SUCCESS; } @@ -63,6 +65,6 @@ int ShuffleChannelConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - shuffle_channel, +REGISTER_SUBGRAPH_BRIDGE(shuffle_channel, + kNPU, paddle::lite::subgraph::npu::ShuffleChannelConverter); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op_test.cc b/lite/kernels/npu/bridges/shuffle_channel_op_test.cc deleted file mode 100644 index cbf2eac9f3d4805e1b5bc4573189194f962c2d03..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/shuffle_channel_op_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/shuffle_channel_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -void shuffle_channel_ref( - const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->mutable_data(); - auto out_data = out->mutable_data(); - int group = op_info->GetAttr("group"); - auto x_dims = x->dims(); - - int n_size = x_dims.production() / x_dims[0]; - int c_size = n_size / x_dims[1]; - for (int n = 0; n < x_dims[0]; n++) { - int g_num = x_dims[1] / group; - auto tmp_out_data = out_data; - for (int g = 0; g < g_num; g++) { - auto tmp_x_data = x_data + g * c_size; - for (int i = 0; i < group; i++) { - std::memcpy(tmp_out_data, - tmp_x_data + i * g_num * c_size, - c_size * sizeof(float)); - tmp_out_data += c_size; - } - } - x_data += n_size; - out_data += n_size; - } -} - -void test_shuffle_channel(int bs, int ic, int ih, int iw, int group) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = 
scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("shuffle_channel"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("group", group); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - shuffle_channel_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, softmax) { - for (auto bs : {1, 4}) { - for (auto ic : {1, 24, 35}) { - for (auto ih : {1, 4}) { - for (auto iw : {1, 4}) { - for (auto group : {1, 3, 7, 24, 35}) { - if (ic % group != 0) continue; - test_shuffle_channel(bs, ic, ih, iw, group); - } - } - } - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(shuffle_channel); -USE_NPU_BRIDGE(shuffle_channel); diff --git a/lite/kernels/npu/bridges/softmax_op.cc b/lite/kernels/npu/bridges/softmax_op.cc index 01d8b0a944aeb806eb28447402629993c8e13c62..24bbb790e08b4b0ff675173af8faad3b07f8f2e0 100644 --- a/lite/kernels/npu/bridges/softmax_op.cc +++ b/lite/kernels/npu/bridges/softmax_op.cc @@ -37,29 +37,34 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto x_rank = x_dims.size(); auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); 
auto axis = op_info->GetAttr("axis"); - if (x_dims.size() > 3) { - CHECK(!(axis == 2 && x_dims[3] > 1)) - << "[NPU] Unsupported softmax params: axis = " << axis - << " :x_w = " << x_dims[3]; + if (axis < 0) { + axis += x_rank; + } + if (axis == 2 && x_rank > 3 && x_dims[3] != 1) { + LOG(WARNING) << "[NPU] Unsupported softmax params: axis = " << axis + << " :x_w = " << x_dims[3]; + return FAILED; } // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Softmax node - auto softmax_node = graph->AddNode(out_name); - softmax_node->set_input_x(*x_node); - softmax_node->set_attr_axis(axis); + auto softmax_node = graph->Add(out_name); + auto softmax_op = softmax_node->data(); + softmax_op->set_input_x(*x_node->data()); + softmax_op->set_attr_axis(axis); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -68,6 +73,6 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - softmax, +REGISTER_SUBGRAPH_BRIDGE(softmax, + kNPU, paddle::lite::subgraph::npu::SoftmaxConverter); diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 597de04d5bc520fab2c76218a3b625cc885a22e3..2cdf49fd540bc40ceaaa45df4a6ac65bf94f172a 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -47,33 +47,34 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { int64_t sections_num = static_cast(sections.size()); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, 
*x); } // Split node - auto split_node = graph->AddNode(op_type + "/" + x_name); - split_node->set_input_x(*x_node); - split_node->set_attr_axis(static_cast(axis)); + auto split_node = graph->Add(op_type + "/" + x_name); + auto split_op = split_node->data(); + split_op->set_input_x(*x_node->data()); + split_op->set_attr_axis(static_cast(axis)); if (num > 0) { - split_node->set_attr_output_num(static_cast(num)); + split_op->set_attr_output_num(static_cast(num)); } else { - split_node->set_attr_output_num(sections_num); + split_op->set_attr_output_num(sections_num); auto size_split = ge::AttrValue::LIST_INT(sections.begin(), sections.end()); - split_node->set_attr_size_split(size_split); + split_op->set_attr_size_split(size_split); } - split_node->create_dynamic_output_y(out_names.size()); + split_op->create_dynamic_output_y(out_names.size()); int idx = 1; for (auto& out_name : out_names) { - auto zero_const_node = - graph->AddNode(out_name + "/zero" + std::to_string(idx), 0); - auto add_node = graph->AddNode(out_name); - add_node->set_input_x1(*split_node, "y" + std::to_string(idx)); - add_node->set_input_x2(*zero_const_node); + auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0); + auto add_node = graph->Add(out_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx)); + add_op->set_input_x2(*zero_node->data()); idx++; } return REBUILD_WHEN_SHAPE_CHANGED; @@ -84,6 +85,6 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - split, +REGISTER_SUBGRAPH_BRIDGE(split, + kNPU, paddle::lite::subgraph::npu::SplitConverter); diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc index 2ee58862fb8c78ceca10ca2af3435d85e31d68fd..e8fde2272a28823763f096e087be5f024734cf1b 100644 --- a/lite/kernels/npu/bridges/sqrt_op.cc +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -43,16 +43,17 @@ int 
SqrtConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Sqrt node - auto sqrt_node = graph->AddNode(out_name); - sqrt_node->set_input_x(*x_node); + auto sqrt_node = graph->Add(out_name); + auto sqrt_op = sqrt_node->data(); + sqrt_op->set_input_x(*x_node->data()); return SUCCESS; } @@ -61,4 +62,6 @@ int SqrtConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, sqrt, paddle::lite::subgraph::npu::SqrtConverter); +REGISTER_SUBGRAPH_BRIDGE(sqrt, + kNPU, + paddle::lite::subgraph::npu::SqrtConverter); diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc index 3f6676c8a8fa87452d3961a22f3a8d6c2f9619ad..f03c7690cb490556fe6b26a132454ca109f41310 100644 --- a/lite/kernels/npu/bridges/square_op.cc +++ b/lite/kernels/npu/bridges/square_op.cc @@ -43,16 +43,17 @@ int SquareConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Square node - auto square_node = graph->AddNode(out_name); - square_node->set_input_x(*x_node); + auto square_node = graph->Add(out_name); + auto square_op = square_node->data(); + square_op->set_input_x(*x_node->data()); return SUCCESS; } @@ -61,6 +62,6 @@ int SquareConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle 
-REGISTER_SUBGRAPH_BRIDGE(NPU, - square, +REGISTER_SUBGRAPH_BRIDGE(square, + kNPU, paddle::lite::subgraph::npu::SquareConverter); diff --git a/lite/kernels/npu/bridges/transpose_op.cc b/lite/kernels/npu/bridges/transpose_op.cc index 70449dac7acd0622d454ac742d0c16d85c1bc954..bdac84df3ca96d14891f3636292a13252246be19 100644 --- a/lite/kernels/npu/bridges/transpose_op.cc +++ b/lite/kernels/npu/bridges/transpose_op.cc @@ -37,23 +37,24 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - auto out_name = op_info->Input("Out").front(); + auto out_name = op_info->Output("Out").front(); auto axis = op_info->GetAttr>("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Transpose node - auto transpose_node = graph->AddNode(out_name); - transpose_node->set_input_x(*x_node); - auto w_const_node = graph->AddNode(out_name + "/w", 1.0f); - transpose_node->set_input_w(*w_const_node); - transpose_node->set_attr_order( + auto transpose_node = graph->Add(out_name); + auto transpose_op = transpose_node->data(); + transpose_op->set_input_x(*x_node->data()); + auto w_node = graph->Add(out_name + "/w", 1.0f); + transpose_op->set_input_w(*w_node->data()); + transpose_op->set_attr_order( ge::AttrValue::LIST_INT(axis.begin(), axis.end())); return SUCCESS; } @@ -63,9 +64,9 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - transpose, +REGISTER_SUBGRAPH_BRIDGE(transpose, + kNPU, paddle::lite::subgraph::npu::TransposeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - transpose2, +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kNPU, 
paddle::lite::subgraph::npu::TransposeConverter); diff --git a/lite/kernels/npu/bridges/transpose_op_test.cc b/lite/kernels/npu/bridges/transpose_op_test.cc deleted file mode 100644 index 9ad2610caa4f1674c1a07afd62a4b85361ec6645..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/transpose_op_test.cc +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/transpose_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -int data_index(std::vector pos, DDimLite dims) { - int d1 = dims[1]; - int d2 = dims[2]; - int d3 = dims[3]; - return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; -} - -std::vector pos_trans(std::vector in_pos, std::vector axis) { - std::vector out_pos(in_pos.size()); - for (int i = 0; i < axis.size(); i++) { - out_pos[axis[i]] = in_pos[i]; - } - return out_pos; -} - -void transpose_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto output = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = input->dims(); - auto y_dims = 
output->dims(); - auto axis = op_info->GetAttr>("axis"); - - auto* input_data = input->data(); - auto* output_data = output->mutable_data(); - - int input_n = x_dims[0]; - int input_c = x_dims[1]; - int input_h = x_dims[2]; - int input_w = x_dims[3]; - int output_n = y_dims[0]; - int output_c = y_dims[1]; - int output_h = y_dims[2]; - int output_w = y_dims[3]; - - for (int n = 0; n < input_n; ++n) { - for (int c = 0; c < input_c; ++c) { - for (int h = 0; h < input_h; ++h) { - for (int w = 0; w < input_w; ++w) { - std::vector in_pos{n, c, h, w}; - std::vector out_pos = pos_trans(in_pos, axis); - int in_index = data_index(in_pos, x_dims); - int out_index = data_index(out_pos, y_dims); - output_data[out_index] = input_data[in_index]; - } - } - } - } -} - -void test_transpose(int bs, int ic, int ih, int iw, std::vector axis) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("transpose"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - transpose_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, transpose) { -#if 0 - for (auto bs : {1, 4, 7}) { - for 
(auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {std::vector{0, 1, 2, 3}, - std::vector{0, 1, 3, 2}, - std::vector{0, 3, 1, 2}, - std::vector{1, 2, 3, 0}, - std::vector{3, 2, 1, 0}, - std::vector{2, 3, 1, 0}}) { - test_transpose(bs, ic, ih, iw, axis); - } - } - } - } - } -#endif - test_transpose(2, 3, 4, 5, std::vector{0, 1, 3, 2}); - // test_transpose(2, 3, 4, 5, std::vector{0, 1, 2, 3}); - // test_transpose(2, 2, 2, 2, std::vector{0,1,3,2}); - // test_transpose(1, 1, 2, 2, std::vector{0,1,3,2}); - // test_transpose(1, 1, 1, 2, std::vector{0,1,2,3}); -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(transpose); -USE_NPU_BRIDGE(transpose); - -USE_LITE_OP(transpose2); -USE_NPU_BRIDGE(transpose2); diff --git a/lite/kernels/npu/bridges/unsqueeze_op.cc b/lite/kernels/npu/bridges/unsqueeze_op.cc old mode 100755 new mode 100644 index 8ff95d4ed805f8e125ec0ed7f6fa7f94e02a4f91..bcb3bee83be97133cd7eebc7ae69cbc94080d74d --- a/lite/kernels/npu/bridges/unsqueeze_op.cc +++ b/lite/kernels/npu/bridges/unsqueeze_op.cc @@ -32,30 +32,30 @@ int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto x_name = op_info->Input("X").front(); auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto out_shape = scope->FindTensor(out_name)->dims().Vectorize(); CHECK(op_info->HasAttr("axes")) << "[NPU] unsqueeze not support axes from tensor now"; // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + 
if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Unsqueeze node - auto unsqueeze_node = graph->AddNode(out_name); - unsqueeze_node->set_input_tensor(*x_node); - unsqueeze_node->set_attr_shape( + auto unsqueeze_node = graph->Add(out_name); + auto unsqueeze_op = unsqueeze_node->data(); + unsqueeze_op->set_input_tensor(*x_node->data()); + unsqueeze_op->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -65,9 +65,9 @@ int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - unsqueeze, +REGISTER_SUBGRAPH_BRIDGE(unsqueeze, + kNPU, paddle::lite::subgraph::npu::UnsqueezeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - unsqueeze2, +REGISTER_SUBGRAPH_BRIDGE(unsqueeze2, + kNPU, paddle::lite::subgraph::npu::UnsqueezeConverter); diff --git a/lite/kernels/npu/bridges/unsqueeze_op_test.cc b/lite/kernels/npu/bridges/unsqueeze_op_test.cc deleted file mode 100755 index c59843f614c29ea9fca1dd33e6bc6fd75d4246c6..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/unsqueeze_op_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/unsqueeze_op.h" -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -static DDim GetOutputShape(const std::vector& unsqz_dims, - const DDim& in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validate Check: rank range. - CHECK_LE(output_size, 6) << "The output tensor's rank should be less than 6."; - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Validate Check: the axis bound - CHECK((cur >= 0) && (cur <= cur_output_size)) - << "The unsqueeze dims must be within range of current rank."; - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - - output_shape[cur] = 1; - // Add the output size. 
- cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return DDim(output_shape); -} - -template -void unsqueeze_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - - auto x = scope->FindTensor("x"); - auto out = scope->FindMutableTensor("out_ref"); - auto axes = op_info->GetAttr>("axes"); - auto y_dims = GetOutputShape(axes, x->dims()); - out->Resize(y_dims); - - auto x_data = x->data(); - auto out_data = out->mutable_data(); - - memcpy(out_data, x_data, x->numel() * sizeof(float)); -} - -void test_unsqueeze(const std::vector& input_shape, - std::vector axes) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.NewTensor(x_var_name); - auto* out = scope.NewTensor(out_var_name); - auto* out_ref = scope.NewTensor(out_ref_var_name); - x->Resize(input_shape); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("unsqueeze"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axes", axes); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - - // execute reference implementation and save to output tensor - unsqueeze_ref(op); - - // compare results - CHECK_EQ(out->dims().size(), out_ref->dims().size()); - for (int i = 0; i < out->dims().size(); i++) { - CHECK_EQ(out->dims()[i], out_ref->dims()[i]); - } - - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, 
unsqueeze) { - test_unsqueeze({2}, {0, 2}); - test_unsqueeze({2, 3}, {1, 3}); - test_unsqueeze({1, 2, 3}, {3}); - test_unsqueeze({5, 6, 7}, {1}); -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(unsqueeze); -USE_NPU_BRIDGE(unsqueeze); diff --git a/lite/kernels/npu/bridges/utility.cc b/lite/kernels/npu/bridges/utility.cc old mode 100755 new mode 100644 index f79936c5d7b5350d96abc5617d856b595951eb71..d9c9ffae923631d20c462149a57fccf3335836fd --- a/lite/kernels/npu/bridges/utility.cc +++ b/lite/kernels/npu/bridges/utility.cc @@ -85,10 +85,26 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) { return otype; } +std::vector CvtShape(const std::vector& in_shape) { + std::vector out_shape; + // Padding the shape to 4-dimensions(NCHW) + for (int i = 0; i < 4 - in_shape.size(); i++) { + out_shape.push_back(1); + } + for (int i = 0; i < in_shape.size(); i++) { + out_shape.push_back(in_shape[i]); + } + return out_shape; +} + +std::vector CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_precision, DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); auto in_size = in_tensor.dims().production(); auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h old mode 100755 new mode 100644 index e8300a0472d8b672bab467fe1fbba7a2113ba254..c4721d55a0621e2b57777a9e3b860ccd12e3c6a9 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -19,12 +19,12 @@ #include #include #include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include 
"ai_ddk_lib/include/graph/operator_reg.h" +#include "graph/buffer.h" +#include "graph/graph.h" +#include "graph/model.h" +#include "graph/op/all_ops.h" +#include "graph/operator.h" +#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/utils/macros.h" @@ -70,59 +70,15 @@ ge::DataType CvtPrecisionType(PrecisionType itype); ge::Format CvtDataLayoutType(DataLayoutType itype); +// Padding the shape to 4-dimensions(NCHW) for HiAI +std::vector CvtShape(const std::vector& in_shape); + +std::vector CvtShape(const DDim& in_dims); + ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_precision = PRECISION(kFloat), DataLayoutType in_layout = DATALAYOUT(kNCHW)); -template -ge::TensorPtr CreateTensorAndFillData(const std::vector& data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int16_t)) { - type = ge::DT_INT16; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else if (info == typeid(int64_t)) { - type = ge::DT_INT64; - } else { - LOG(FATAL) << "[NPU] Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - 
int CvtActMode(std::string act_type); } // namespace npu diff --git a/lite/kernels/npu/graph_compute.cc b/lite/kernels/npu/graph_compute.cc deleted file mode 100644 index 9a05a33062fa8f58c0f4bd96424d3fb20e457f4b..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/graph_compute.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/npu/graph_compute.h" -#include -#include - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { - -void GraphCompute::PrepareForRun() { - auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - - // Load HiAI model from the weight tensor and release its buffer - // to save memory - CHECK(param.weight); - CHECK(lite::npu::LoadModel(*param.weight, &model_client_, &model_name_)); - // TODO(hong19860320): find an good way to free the model data. - // No interface exists to free the data of tensor, so I resize the dim to 1 - // and change target to force it to realloc a small size memory. 
- param.weight->Resize({1}); - param.weight->mutable_data(TargetType::kARM); - CHECK(model_client_); - - // Query the dimensions of NPU input and output tensors from HiAI model - std::vector npu_idims; - std::vector npu_odims; - int ret = - model_client_->GetModelIOTensorDim(model_name_, npu_idims, npu_odims); - CHECK_EQ(ret, hiai::AI_SUCCESS) - << "[NPU] Get the dimensions of input and output tensors failed."; - - // Check whether the data sizes of NPU input and output tensors are the - // same as CPU's, then create and initialize NPU input and output tensors. - npu_itensors_.resize(npu_idims.size()); - npu_otensors_.resize(npu_odims.size()); - npu_idatasizes_.resize(npu_idims.size()); - npu_odatasizes_.resize(npu_odims.size()); - for (size_t i = 0; i < npu_idims.size(); ++i) { - auto cpu_itensor = param.inputs[i].second; - CHECK(cpu_itensor); - VLOG(3) << "[NPU] CPU input dims[" << i << "]: " << cpu_itensor->dims(); - VLOG(3) << "[NPU] NPU input dims[" << i << "]: {" - << npu_idims[i].GetNumber() << "," << npu_idims[i].GetChannel() - << "," << npu_idims[i].GetHeight() << "," << npu_idims[i].GetWidth() - << "}"; - npu_idatasizes_[i] = npu_idims[i].GetNumber() * npu_idims[i].GetChannel() * - npu_idims[i].GetHeight() * npu_idims[i].GetWidth(); - CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]); - npu_itensors_[i].reset(new hiai::AiTensor); - npu_itensors_[i]->Init(&(npu_idims[i])); - } - for (size_t i = 0; i < npu_odims.size(); ++i) { - auto cpu_otensor = param.outputs[i].second; - CHECK(cpu_otensor); - VLOG(3) << "[NPU] CPU output dims[" << i << "]: " << cpu_otensor->dims(); - VLOG(3) << "[NPU] NPU output dims[" << i << "]: {" - << npu_odims[i].GetNumber() << "," << npu_odims[i].GetChannel() - << "," << npu_odims[i].GetHeight() << "," << npu_odims[i].GetWidth() - << "}"; - npu_odatasizes_[i] = npu_odims[i].GetNumber() * npu_odims[i].GetChannel() * - npu_odims[i].GetHeight() * npu_odims[i].GetWidth(); - if (cpu_otensor->dims().production() != 
npu_odatasizes_[i]) { - cpu_otensor->Resize({npu_odims[i].GetNumber(), - npu_odims[i].GetChannel(), - npu_odims[i].GetHeight(), - npu_odims[i].GetWidth()}); - } - npu_otensors_[i].reset(new hiai::AiTensor); - npu_otensors_[i]->Init(&(npu_odims[i])); - } -} - -void GraphCompute::Run() { - auto& param = this->Param(); - - // Check whether the data sizes of NPU input tensors are the same as - // CPU's, and copy the data of CPU input tensors to NPU's. - CHECK_EQ(param.inputs.size(), npu_itensors_.size()); - CHECK_EQ(param.outputs.size(), npu_otensors_.size()); - for (size_t i = 0; i < param.inputs.size(); ++i) { - auto cpu_itensor = param.inputs[i].second; - CHECK(cpu_itensor); - CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]); - std::memcpy(static_cast(npu_itensors_[i]->GetBuffer()), - cpu_itensor->data(), - sizeof(float) * static_cast(npu_idatasizes_[i])); - } - - // Run HiAI model with model name - std::string key = "model_name"; // Note: key seems must be model_name - model_context_.AddPara(key, model_name_); - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - int istamp; - auto start_time = GetCurrentUS(); - CHECK_EQ(hiai::AI_SUCCESS, - model_client_->Process( - model_context_, npu_itensors_, npu_otensors_, 1000, istamp)); - VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - - // Check whether the data sizes of NPU output tensors are the same as - // CPU's, and copy the data of NPU output tensors to CPU's. 
- for (size_t i = 0; i < param.outputs.size(); ++i) { - auto cpu_otensor = param.outputs[i].second; - CHECK(cpu_otensor); - CHECK_EQ(cpu_otensor->dims().production(), npu_odatasizes_[i]); - std::memcpy(cpu_otensor->mutable_data(), - static_cast(npu_otensors_[i]->GetBuffer()), - sizeof(float) * static_cast(npu_odatasizes_[i])); - } -} - -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(graph_op, - kNPU, - kFloat, - kNCHW, - paddle::lite::kernels::npu::GraphCompute, - def) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .Finalize(); diff --git a/lite/kernels/npu/graph_compute.h b/lite/kernels/npu/graph_compute.h deleted file mode 100644 index b289b8e42f49e347fe72c5f9f37ea80bc30fc6a2..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/graph_compute.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { - -class GraphCompute : public KernelLite { - public: - using param_t = operators::GraphParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~GraphCompute() = default; - - private: - std::shared_ptr model_client_; - std::string model_name_; - hiai::AiContext model_context_; - - std::vector npu_idatasizes_; - std::vector npu_odatasizes_; - std::vector> npu_itensors_; - std::vector> npu_otensors_; -}; - -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc old mode 100755 new mode 100644 index d9b191950668660ae2b76b70ac2b5c12aece92c0..6f32099274f449ab51ce0f4751f99d33c3d7cd72 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -16,7 +16,7 @@ #include #include #include -#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "hiai_ir_build.h" // NOLINT #include "lite/backends/npu/device.h" #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/graph.h" @@ -39,13 +39,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); - if (!bridges.Exists("NPU", op_type)) { + if (!bridges.Exists(op_type, "kNPU")) { return subgraph::FAILED; } auto kernel = inst.kernel(); - status |= bridges.Select("NPU", op_type)(reinterpret_cast(&graph), - const_cast(op), - const_cast(kernel)); + status |= bridges.Select(op_type, "kNPU")(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } @@ -57,26 +57,26 @@ int SubgraphEngine::BuildDeviceProgram() { std::vector 
device_inodes; std::vector device_onodes; for (auto& input_name : input_names_) { - if (graph.HasNode(input_name)) { - if (!graph.GetType(input_name).persistable()) { - device_inodes.push_back(*graph.GetNode(input_name)); + if (graph.Has(input_name)) { + if (graph.Get(input_name)->is_data()) { + device_inodes.push_back(*graph.Get(input_name)->data()); device_inames_.push_back(input_name); } else { LOG(WARNING) << "[NPU] Input node " << input_name - << " is skipped because it is a persistable node."; + << " is ignored because it is not a data node."; } } else { LOG(WARNING) << "[NPU] Input node " << input_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } for (auto& output_name : output_names_) { - if (graph.HasNode(output_name)) { - device_onodes.push_back(*graph.GetNode(output_name)); + if (graph.Has(output_name)) { + device_onodes.push_back(*graph.Get(output_name)->data()); device_onames_.push_back(output_name); } else { LOG(WARNING) << "[NPU] Output node " << output_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } CHECK(!device_inames_.empty()) @@ -108,14 +108,14 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); for (int i = 0; i < device_inames_.size(); i++) { - auto type = graph.GetType(device_inames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Inputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) 
<< " dims: {" << device_idims[i].GetNumber() << "," << device_idims[i].GetChannel() << "," @@ -129,14 +129,14 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i]->Init(&(device_idims[i])); } for (int i = 0; i < device_onames_.size(); i++) { - auto type = graph.GetType(device_onames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Outputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: {" << device_odims[i].GetNumber() << "," << device_odims[i].GetChannel() << "," diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h old mode 100755 new mode 100644 index 27b4a36cfeadf6cca328fb9c980d53c9c5e79095..2cdc4a0e62fe748a8b1d1dfb8f90c17b1d36e869 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -17,7 +17,7 @@ #include #include #include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" +#include "HiAiModelManagerService.h" #include "lite/core/kernel.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 3423b1e920e5e7c4aaa34125303b09d943e47b62..f4d3254a7b54cfea96fc2419bd425f8328990ebe 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -14,7 +14,7 @@ add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps}) add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) 
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) @@ -49,12 +49,14 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc - DEPS conv2d_1x1_opencl cl_image_converter op_registry program context - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) +#lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc +# DEPS conv2d_1x1_opencl cl_image_converter op_registry program context +# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) + lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc DEPS reshape_opencl cl_image_converter op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) + lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc DEPS conv_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) diff --git a/lite/kernels/opencl/conv2d_1x1_compute.cc b/lite/kernels/opencl/conv2d_1x1_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/conv2d_1x1_compute_test.cc b/lite/kernels/opencl/conv2d_1x1_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/reshape_compute.cc b/lite/kernels/opencl/reshape_compute.cc old mode 100755 new 
mode 100644 diff --git a/lite/kernels/opencl/reshape_compute_test.cc b/lite/kernels/opencl/reshape_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/x86/fc_compute_test.cc b/lite/kernels/x86/fc_compute_test.cc deleted file mode 100644 index abc0597457b7bc8ccd5e9f760ebd28197d7a85d5..0000000000000000000000000000000000000000 --- a/lite/kernels/x86/fc_compute_test.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include "lite/kernels/x86/fc_compute.h" -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -TEST(fc_x86, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); - ASSERT_FALSE(fc.empty()); - ASSERT_TRUE(fc.front()); -} - -TEST(fc_x86, init) { - FcCompute fc; - ASSERT_EQ(fc.precision(), PRECISION(kFloat)); - ASSERT_EQ(fc.target(), TARGET(kX86)); -} - -TEST(fc_x86, run_test) { - lite::Tensor x, w, b, out; - constexpr int batch_size = 2; - std::vector x_shape{batch_size, 3}; - x.Resize(lite::DDim(x_shape)); - std::vector w_shape{3, 4}; - w.Resize(lite::DDim(w_shape)); - std::vector b_shape{1, 4}; - b.Resize(lite::DDim(b_shape)); - std::vector out_shape{1, 4}; - out.Resize(lite::DDim(out_shape)); - - auto x_data = x.mutable_data(); - auto w_data = w.mutable_data(); - auto b_data = b.mutable_data(); - auto out_data = out.mutable_data(); - - for (int64_t i = 0; i < x.dims().production(); i++) { - x_data[i] = static_cast(i); - } - for (int64_t i = 0; i < w.dims().production(); i++) { - w_data[i] = static_cast(i); - } - for (int64_t i = 0; i < b.dims().production(); i++) { - b_data[i] = static_cast(i); - } - - /* lite::x86::math::fc_compute_eigen(x_data, batch_size, 3, // - w_data, 3, 4, // - b_data, ref_data); */ - - // FcCompute fc; - FcCompute fc; - operators::FcParam param; - - param.in_num_col_dims = 1; - param.input = &x; - param.w = &w; - param.bias = &b; - param.output = &out; - param.in_mat_dims = x.dims(); - - // std::unique_ptr ctx(new KernelContext); - // ctx->As(); - fc.SetParam(param); - // fc.SetContext(std::move(ctx)); - fc.Run(); - - VLOG(3) << "output vs ref"; - for (int i = 0; i < out.dims().production(); i++) { - VLOG(3) << out_data[i]; - } - - /* for (int i = 0; i < out.dims().production(); ++i) { - EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); - }*/ -} - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle - 
-USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index bbbdb91debfd7d7b046a3eb18a535462c69e358c..ca2ddf60c5e150ba7d2712ccb2e67e444cd07010 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -78,7 +78,7 @@ class LayerNormCompute : public KernelLite { Scale->data(), Bias->data(), static_cast(left), - static_cast(epsilon), + epsilon, right); } diff --git a/lite/kernels/x86/relu_compute.cc b/lite/kernels/x86/relu_compute.cc deleted file mode 100644 index 684b1442540637d5aadfbdd124ca2195bd7a0ca5..0000000000000000000000000000000000000000 --- a/lite/kernels/x86/relu_compute.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/x86/relu_compute.h" - -REGISTER_LITE_KERNEL(relu, - kX86, - kFloat, - kNCHW, - paddle::lite::kernels::x86::ReluCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); diff --git a/lite/kernels/x86/relu_compute.h b/lite/kernels/x86/relu_compute.h deleted file mode 100644 index b80a99302ad31182e659bf62de8ff367aadca7bc..0000000000000000000000000000000000000000 --- a/lite/kernels/x86/relu_compute.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" -#include "lite/operators/relu_op.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class ReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto n = param.X->dims().production(); - const float* input = param.X->data(); - float* output = param.Out->mutable_data(); - for (int i = 0; i < n; i++) { - output[i] = std::max(0.f, input[i]); - } - } - - virtual ~ReluCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/bridges/act_op.cc b/lite/kernels/xpu/bridges/act_op.cc index f674af84caac466cfe2b06e32360dacccd2bdf5e..e3d4588aa2aed1268a8e15f654019031a5202542 100644 --- a/lite/kernels/xpu/bridges/act_op.cc +++ b/lite/kernels/xpu/bridges/act_op.cc @@ -43,20 +43,21 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if 
(graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Act node if (op_type == "relu") { - graph->AddNode(out_name, graph->builder_.CreateRelu(*x_node)); + graph->Add(out_name, graph->builder_.CreateRelu(*x_node->data())); } else if (op_type == "tanh") { - graph->AddNode(out_name, graph->builder_.CreateUnaryOp("tanh", *x_node)); + graph->Add(out_name, + graph->builder_.CreateUnaryOp("tanh", *x_node->data())); } else if (op_type == "gelu") { - graph->AddNode(out_name, graph->builder_.CreateGelu(*x_node)); + graph->Add(out_name, graph->builder_.CreateGelu(*x_node->data())); } else { // TODO(hong19860320) supports more activation ops LOG(WARNING) << "[XPU] Unsupported activation type " << op_type; @@ -70,6 +71,6 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, relu, paddle::lite::subgraph::xpu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, tanh, paddle::lite::subgraph::xpu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, gelu, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu, kXPU, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(tanh, kXPU, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(gelu, kXPU, paddle::lite::subgraph::xpu::ActConverter); diff --git a/lite/kernels/xpu/bridges/act_op_test.cc b/lite/kernels/xpu/bridges/act_op_test.cc deleted file mode 100644 index 1a3efab46e3c7caee08bf646a560a0ab9abcf5c7..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/bridges/act_op_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" -#include "lite/operators/activation_ops.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -void relu_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); - } -} - -void test_relu(int bs, int ic, int ih, int iw) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("relu"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - - // create and convert op to XPU model, and run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, 
{x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - relu_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(relu); -USE_XPU_BRIDGE(relu); diff --git a/lite/kernels/xpu/bridges/batch_norm_op.cc b/lite/kernels/xpu/bridges/batch_norm_op.cc index 980f241660c6cec6856d738197407dd866e36ed3..d84b9cc4f190432166575cd689e839af0d0e0b12 100644 --- a/lite/kernels/xpu/bridges/batch_norm_op.cc +++ b/lite/kernels/xpu/bridges/batch_norm_op.cc @@ -37,55 +37,61 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto scale_name = op_info->Input("Scale").front(); auto scale_type = kernel->GetInputDeclType("Scale"); CHECK(scale_type->precision() == PRECISION(kFloat)); CHECK(scale_type->layout() == DATALAYOUT(kNCHW)); auto scale = scope->FindMutableTensor(scale_name); + auto bias_name = op_info->Input("Bias").front(); auto bias_type = kernel->GetInputDeclType("Bias"); CHECK(bias_type->precision() == PRECISION(kFloat)); CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); auto bias = scope->FindMutableTensor(bias_name); + auto mean_name = op_info->Input("Mean").front(); auto mean_type = kernel->GetInputDeclType("Mean"); CHECK(mean_type->precision() == PRECISION(kFloat)); CHECK(mean_type->layout() == 
DATALAYOUT(kNCHW)); auto mean = scope->FindMutableTensor(mean_name); + auto variance_name = op_info->Input("Variance").front(); auto variance_type = kernel->GetInputDeclType("Variance"); CHECK(variance_type->precision() == PRECISION(kFloat)); CHECK(variance_type->layout() == DATALAYOUT(kNCHW)); auto variance = scope->FindMutableTensor(variance_name); + auto y_name = op_info->Output("Y").front(); auto y_type = kernel->GetOutputDeclType("Y"); CHECK(y_type->precision() == PRECISION(kFloat)); CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto epsilon = op_info->GetAttr("epsilon"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale, Bias, Mean, Variance node - auto scale_const_node = graph->AddNode(scale_name, *scale); - auto bias_const_node = graph->AddNode(bias_name, *bias); - auto mean_const_node = graph->AddNode(mean_name, *mean); - auto variance_const_node = graph->AddNode(variance_name, *variance); + auto scale_node = graph->Add(scale_name, *scale); + auto bias_node = graph->Add(bias_name, *bias); + auto mean_node = graph->Add(mean_name, *mean); + auto variance_node = graph->Add(variance_name, *variance); // Batch Norm node and extract the first field as the output node - auto batch_norm_node = graph->builder_.CreateBatchNorm(*x_node, - *scale_const_node, - *bias_const_node, - *mean_const_node, - *variance_const_node, + auto batch_norm_data = graph->builder_.CreateBatchNorm(*x_node->data(), + *scale_node->data(), + *bias_node->data(), + *mean_node->data(), + *variance_node->data(), 1, epsilon); - graph->AddNode(y_name, graph->builder_.GetField(batch_norm_node, 0)); + graph->Add(y_name, graph->builder_.GetField(batch_norm_data, 0)); return SUCCESS; } @@ -94,6 +100,6 @@ int BatchNormConverter(void* ctx, OpLite* 
op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - batch_norm, +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kXPU, paddle::lite::subgraph::xpu::BatchNormConverter); diff --git a/lite/kernels/xpu/bridges/batch_norm_op_test.cc b/lite/kernels/xpu/bridges/batch_norm_op_test.cc deleted file mode 100644 index dec475530a5bb5c692946bc8d185ea81990a6408..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/bridges/batch_norm_op_test.cc +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void 
test_batch_norm(int bs, int ic, int ih, int iw, float epsilon) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", 0.9f); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to XPU model, then run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - 
for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 3}) { - for (auto ic : {2, 3}) { - for (auto ih : {4}) { - for (auto iw : {5}) { - for (auto epsilon : {1e-5f}) { - test_batch_norm(bs, ic, ih, iw, epsilon); - } - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_XPU_BRIDGE(batch_norm); diff --git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 5e9e5448a1f9bcf4d4fd17b8e8a1d5529c14c59d..fe9c598847977e87d87950c3850d3e1d074958b2 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -61,11 +61,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } if (paddings.size() == 2L) { @@ -99,7 +99,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { DDim output_dims(output_shape); // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Conv node auto conv_attrs = xtcl::make_node(); @@ -114,9 +114,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { conv_attrs->out_layout = ""; // conv_attrs->out_dtype = ""; auto conv_node = - graph->AddNode(output_name, - graph->builder_.CreateConv2D( - *input_node, *filter_const_node, conv_attrs)); + graph->Add(output_name, + graph->builder_.CreateConv2D( + *input_node->data(), *filter_node->data(), conv_attrs)); // Add bias node if exists 
bias // supports the bias nodes with the following dimensions @@ -149,30 +149,27 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " isn't supported in conv2d Op when output dimension is " << output_dims; } - std::shared_ptr bias_node = nullptr; - if (graph->HasNode(bias_name)) { - // Bias node from input node - bias_node = graph->GetNode(bias_name); + std::shared_ptr bias_node = nullptr; + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); } else { - // Bias node with const data - bias_node = graph->AddNode(bias_name, *bias, bias_shape); + bias_node = graph->Add(bias_name, *bias, bias_shape); } - std::shared_ptr add_node = nullptr; if (is_channel_bias) { - add_node = graph->AddNode( - output_name, - graph->builder_.CreateBiasAdd(*conv_node, 1, *bias_node)); + conv_node = graph->Add(output_name, + graph->builder_.CreateBiasAdd( + *conv_node->data(), 1, *bias_node->data())); } else { - add_node = graph->AddNode( - output_name, - graph->builder_.CreateBinaryOp("add", *conv_node, *bias_node)); + conv_node = + graph->Add(output_name, + graph->builder_.CreateBinaryOp( + "add", *conv_node->data(), *bias_node->data())); } - conv_node = add_node; } if (fuse_relu) { // Append relu node if fuse_relu is true - graph->AddNode(output_name, graph->builder_.CreateRelu(*conv_node)); + graph->Add(output_name, graph->builder_.CreateRelu(*conv_node->data())); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -182,9 +179,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - conv2d, +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kXPU, paddle::lite::subgraph::xpu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - depthwise_conv2d, +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kXPU, paddle::lite::subgraph::xpu::ConvConverter); diff --git a/lite/kernels/xpu/bridges/dropout_op.cc b/lite/kernels/xpu/bridges/dropout_op.cc old mode 100755 new mode 100644 index 
ae81facd536042c2e6f3bd273fe89a14938fb7bc..df869e17ff5626f03d6eb988a1687bb51c75d440 --- a/lite/kernels/xpu/bridges/dropout_op.cc +++ b/lite/kernels/xpu/bridges/dropout_op.cc @@ -46,21 +46,21 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { op_info->GetAttr("dropout_implementation"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Dropout node if (dropout_implementation == "downgrade_in_infer") { - graph->AddNode( - out_name, - graph->builder_.CreateScale(*x_node, 1.f - dropout_prob, 0.0f, false)); + graph->Add(out_name, + graph->builder_.CreateScale( + *x_node->data(), 1.f - dropout_prob, 0.0f, false)); } else if (dropout_implementation == "upscale_in_train") { - graph->AddNode(out_name, - graph->builder_.CreateScale(*x_node, 1.0f, 0.0f, false)); + graph->Add(out_name, + graph->builder_.CreateScale(*x_node->data(), 1.0f, 0.0f, false)); } else { LOG(WARNING) << "[XPU] Unsupported dropout_implementation == " << dropout_implementation << " for dropout"; @@ -74,6 +74,6 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - dropout, +REGISTER_SUBGRAPH_BRIDGE(dropout, + kXPU, paddle::lite::subgraph::xpu::DropoutConverter); diff --git a/lite/kernels/xpu/bridges/elementwise_ops.cc b/lite/kernels/xpu/bridges/elementwise_ops.cc index 49a42c55d66fc72eb62f3c04cb53a2efbba89238..7fcae312b9776afa7e3b1cbd1bd17bd25b2e4aab 100644 --- a/lite/kernels/xpu/bridges/elementwise_ops.cc +++ b/lite/kernels/xpu/bridges/elementwise_ops.cc @@ -50,29 +50,31 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { 
- x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Elementwise node - std::shared_ptr elementwise_node = nullptr; + std::shared_ptr elt_node = nullptr; if (y_dims.size() == 1) { - elementwise_node = graph->AddNode( - out_name, graph->builder_.CreateBiasAdd(*x_node, axis, *y_node)); + elt_node = graph->Add( + out_name, + graph->builder_.CreateBiasAdd(*x_node->data(), axis, *y_node->data())); } else if (x_dims.size() == y_dims.size()) { - elementwise_node = graph->AddNode( - out_name, graph->builder_.CreateBinaryOp("add", *x_node, *y_node)); + elt_node = graph->Add(out_name, + graph->builder_.CreateBinaryOp( + "add", *x_node->data(), *y_node->data())); } else { LOG(WARNING) << "[XPU] elementwise_add only support y of one dimension, or x " @@ -88,6 +90,6 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - elementwise_add, +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kXPU, paddle::lite::subgraph::xpu::ElementwiseConverter); diff --git a/lite/kernels/xpu/bridges/gather_op.cc b/lite/kernels/xpu/bridges/gather_op.cc old mode 100755 new mode 100644 index 06d1c67b0d1419192e4c8ed6219f79a8c010a06b..845bbb8d98f5734b855178fd68880c5c901608bc --- a/lite/kernels/xpu/bridges/gather_op.cc +++ b/lite/kernels/xpu/bridges/gather_op.cc @@ -54,38 +54,42 @@ int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_dims = out->dims(); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node 
= graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Index node - std::shared_ptr index_node = nullptr; - if (graph->HasNode(index_name)) { - index_node = graph->GetNode(index_name); + std::shared_ptr index_node = nullptr; + if (graph->Has(index_name)) { + index_node = graph->Get(index_name); } else { - index_node = graph->AddNode( - index_name, index_dims, index_type->precision(), index_type->layout()); + index_node = graph->Add(index_name, *index); } // Flatten index node if (index_dims.size() != 1) { index_node = - graph->AddNode(index_name + "/reshape", - graph->builder_.CreateReshape(*index_node, {-1}), - index_type->precision(), - index_type->layout()); + graph->Add(index_name + "/reshape", + graph->builder_.CreateReshape(*index_node->data(), {-1}), + index_node->precision(), + index_node->layout()); } // Reshape the gather node with the inferred shape as the output node - auto gather_node = graph->AddNode( - out_name, - graph->builder_.CreateGather(*x_node, *index_node, /* axis= */ 0)); + auto gather_node = + graph->Add(out_name, + graph->builder_.CreateGather( + *x_node->data(), *index_node->data(), /* axis= */ 0), + x_node->precision(), + x_node->layout()); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *gather_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*gather_node->data(), + CvtShape(out_dims)), + gather_node->precision(), + gather_node->layout()); } return SUCCESS; } @@ -95,6 +99,6 @@ int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - gather, +REGISTER_SUBGRAPH_BRIDGE(gather, + kXPU, paddle::lite::subgraph::xpu::GatherConverter); diff --git a/lite/kernels/xpu/bridges/graph.cc b/lite/kernels/xpu/bridges/graph.cc old mode 100755 
new mode 100644 index 1691e4b0c50d6fe1d606a0d744a42a4afeae1aa8..43aaad3402b7873dbaa67d4c4897b5378e098500 --- a/lite/kernels/xpu/bridges/graph.cc +++ b/lite/kernels/xpu/bridges/graph.cc @@ -21,71 +21,70 @@ namespace lite { namespace subgraph { namespace xpu { -std::shared_ptr Graph::AddNode(const std::string& name, - const xtcl::xExpr& layer, - PrecisionType precision, - DataLayoutType layout) { - auto unique_name = [&](const std::string& key) { - int idx = 1; - auto it = counts_.find(key); - if (it == counts_.end()) { - counts_.insert(std::make_pair(key, idx)); - } else { - idx = ++(it->second); - } - return key + "_" + std::to_string(idx); - }; +int Graph::Add(const std::string& name, std::shared_ptr node) { auto it = nodes_.find(name); if (it != nodes_.end()) { - // Only variable can rebind the name - CHECK(!it->second.second.persistable()) << "[XPU] Node " << name - << " redefined."; - // Generate a new unique name as the key to bind the origin node if the - // origin node isn't a const node: new_name->node - nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); - nodes_.erase(it); + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[XPU] Const or data node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; } - // Create a new node and bind with the name: name->new_node - auto node = std::make_shared(layer); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, false)))); - builder_.SetLayer(unique_name(name + "_op")); - return node; + it->second.push_back(node); + return it->second.size(); } -// Const node -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - PrecisionType precision, - DataLayoutType layout) { - return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); +// 
Variable node +std::shared_ptr Graph::Add(const std::string& name, + const xtcl::xExpr& layer, + PrecisionType precision, + DataLayoutType layout) { + auto node = std::make_shared(precision, layout, Node::Role::kVar); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + node->set_data(std::make_shared(layer)); + // Generate a unique name for the current XTCL layer + builder_.SetLayer(name + "__" + std::to_string(idx)); + return node; } -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - CHECK(!HasNode(name)) << "[NPU] Node " << name << " redefined."; - auto node = std::make_shared(builder_.CreateTensor( - name, CvtShape(shape), CvtPrecisionType(precision))); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, true)))); - params_.emplace( - std::make_pair(name, *CvtTensor(tensor, shape, precision, layout))); +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = std::make_shared(precision, layout, Node::Role::kConst); + auto idx = Add(name, node); + CHECK_EQ(idx, 1); + node->set_data(std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision)))); + params_.emplace(std::make_pair(name, *CvtTensor(tensor, shape, layout))); + } else { + // Data node + node = Add(name, shape, precision, layout); + } return node; } // Data node -std::shared_ptr Graph::AddNode(const std::string& name, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - CHECK(!HasNode(name)) << "[NPU] Node " << name << " redefined."; - auto node = std::make_shared(builder_.CreateTensor( - name, CvtShape(shape), CvtPrecisionType(precision))); - nodes_.insert(std::make_pair( - name, 
std::make_pair(node, Type(precision, layout, false)))); +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = std::make_shared(precision, layout, Node::Role::kData); + auto idx = Add(name, node); + CHECK_EQ(idx, 1); + node->set_data(std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision)))); return node; } diff --git a/lite/kernels/xpu/bridges/graph.h b/lite/kernels/xpu/bridges/graph.h old mode 100755 new mode 100644 index 3107346851037a5e0ed4b8d709de836dc582b8b8..dafd8d853210278220b79fdf58895484cbd89ec0 --- a/lite/kernels/xpu/bridges/graph.h +++ b/lite/kernels/xpu/bridges/graph.h @@ -28,78 +28,78 @@ namespace lite { namespace subgraph { namespace xpu { -// Type of graph nodes -class Type { +// Graph and node is defined to collect all of converted XTCL IR nodes +class Node { public: - Type(PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - bool persistable = false) - : precision_(precision), layout_(layout), persistable_(persistable) {} - + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } void set_precision(PrecisionType precision) { precision_ = precision; } void set_layout(DataLayoutType layout) { layout_ = layout; } - void set_persistable(bool persistable) { persistable_ = persistable; } + void set_role(Role role) { role_ = role; } + std::shared_ptr data() { return data_; } PrecisionType precision() const { return precision_; } DataLayoutType layout() const { return layout_; } - bool persistable() const { return persistable_; } + Role role() const { 
return role_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } private: + std::shared_ptr data_{nullptr}; PrecisionType precision_{PRECISION(kFloat)}; DataLayoutType layout_{DATALAYOUT(kNCHW)}; - bool persistable_{false}; + Role role_{Role::kVar}; }; -// Graph to collect all of converted XPU IR nodes class Graph { public: - // Layer node - std::shared_ptr AddNode( - const std::string& name, - const xtcl::xExpr& layer, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); + int Add(const std::string& name, std::shared_ptr node); + + // Variable node + std::shared_ptr Add(const std::string& name, + const xtcl::xExpr& layer, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); + } - // Const node - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, dims.Vectorize(), precision, layout); + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) 
{ + return Add(name, tensor, dims.Vectorize(), layout); } + // Const node template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - std::vector shape = {}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - const std::type_info& info = typeid(T); - PrecisionType precision = PRECISION(kFloat); - if (info == typeid(float)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int8_t)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int32_t)) { - precision = PRECISION(kInt32); - } else { - LOG(FATAL) << "[XPU] Unknow data type " << info.name(); - } + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { if (shape.empty()) { shape = {static_cast(data.size())}; } else { @@ -111,70 +111,61 @@ class Graph { } Tensor tensor; tensor.Resize(shape); + tensor.set_persistable(true); std::memcpy(reinterpret_cast(tensor.mutable_data()), reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, precision, layout); + return Add(name, tensor, layout); } template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, data, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, data, dims.Vectorize(), layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - std::vector shape = {1}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, layout); + return Add(name, data, shape, layout); } template - 
std::shared_ptr AddNode( - const std::string& name, - T value, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, value, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); } // Data node - std::shared_ptr AddNode( - const std::string& name, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, dims.Vectorize(), precision, layout); - } - - std::shared_ptr GetNode(const std::string& name) { - CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; - return nodes_.at(name).first; + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); } - const Type& GetType(const std::string& name) { - CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; - return nodes_.at(name).second; + std::shared_ptr Get(const std::string& name) { + CHECK(Has(name)) << "[XPU] Node " << name << " not found."; + return nodes_.at(name).back(); } - bool HasNode(const std::string& name) { + bool Has(const std::string& name) { return nodes_.find(name) != nodes_.end(); } @@ -184,9 +175,7 @@ class Graph { xtcl::network::xTensorCompiler::ParamNDArrayMap params_; private: - std::unordered_map, Type>> - nodes_; - std::unordered_map counts_; + std::unordered_map>> nodes_; }; } // namespace xpu diff --git a/lite/kernels/xpu/bridges/layer_norm_op.cc 
b/lite/kernels/xpu/bridges/layer_norm_op.cc old mode 100755 new mode 100644 index 601dd42770e565bd638ffbc5bc4d71aff39cc721..3ad190b73f59d7f1decf01c52d24799550daaea8 --- a/lite/kernels/xpu/bridges/layer_norm_op.cc +++ b/lite/kernels/xpu/bridges/layer_norm_op.cc @@ -51,23 +51,23 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto x_inner_size = x_dims.Slice(axis, x_rank).production(); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } if (reshape) { auto reshaped_x_dims = x_dims.Slice(0, axis).Vectorize(); reshaped_x_dims.push_back(x_inner_size); - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape( - *x_node, CvtShape(reshaped_x_dims))); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node->data(), CvtShape(reshaped_x_dims))); } // Scale node - std::shared_ptr scale_const_node = nullptr; + std::shared_ptr scale_node = nullptr; if (HasInputArg(op_info, scope, "Scale")) { auto scale_name = op_info->Input("Scale").front(); auto scale_type = kernel->GetInputDeclType("Scale"); @@ -77,14 +77,13 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scale_dims = scale->dims(); CHECK_EQ(scale_dims.size(), 1); CHECK_EQ(scale_dims.production(), x_inner_size); - scale_const_node = graph->AddNode(scale_name, *scale); + scale_node = graph->Add(scale_name, *scale); } else { - scale_const_node = - graph->AddNode(y_name + "/scale_one", 1.0f, {x_inner_size}); + scale_node = graph->Add(y_name + "/scale_one", 1.0f, {x_inner_size}); } // Bias node - std::shared_ptr bias_const_node = nullptr; + std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); auto bias_type = 
kernel->GetInputDeclType("Bias"); @@ -94,26 +93,25 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto bias_dims = bias->dims(); CHECK_EQ(bias_dims.size(), 1); CHECK_EQ(bias_dims.production(), x_inner_size); - bias_const_node = graph->AddNode(bias_name, *bias); + bias_node = graph->Add(bias_name, *bias); } else { - bias_const_node = - graph->AddNode(y_name + "/bias_zero", 0.0f, {x_inner_size}); + bias_node = graph->Add(y_name + "/bias_zero", 0.0f, {x_inner_size}); } // Layer Norm node auto layer_norm_node = - graph->AddNode(y_name, - graph->builder_.CreateLayerNorm(*x_node, - *scale_const_node, - *bias_const_node, - axis, - epsilon, - true, - true)); + graph->Add(y_name, + graph->builder_.CreateLayerNorm(*x_node->data(), + *scale_node->data(), + *bias_node->data(), + axis, + epsilon, + true, + true)); if (reshape) { - graph->AddNode(y_name, - graph->builder_.CreateReshape( - *layer_norm_node, CvtShape(y_dims))); + graph->Add(y_name, + graph->builder_.CreateReshape(*layer_norm_node->data(), + CvtShape(y_dims))); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -123,6 +121,6 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - layer_norm, +REGISTER_SUBGRAPH_BRIDGE(layer_norm, + kXPU, paddle::lite::subgraph::xpu::LayerNormConverter); diff --git a/lite/kernels/xpu/bridges/lookup_table_op.cc b/lite/kernels/xpu/bridges/lookup_table_op.cc old mode 100755 new mode 100644 index a03e0c2d24deb691e1de464c62ea8ef76d76ddab..eecf50b5bd601e912483adb39154a7430bc05c9e --- a/lite/kernels/xpu/bridges/lookup_table_op.cc +++ b/lite/kernels/xpu/bridges/lookup_table_op.cc @@ -57,30 +57,37 @@ int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Ids node - std::shared_ptr ids_node = nullptr; - if (graph->HasNode(ids_name)) { - ids_node = graph->GetNode(ids_name); + std::shared_ptr ids_node = nullptr; + if (graph->Has(ids_name)) { + ids_node = 
graph->Get(ids_name); } else { - ids_node = graph->AddNode( - ids_name, ids_dims, ids_type->precision(), ids_type->layout()); + ids_node = graph->Add(ids_name, *ids); } // Flatten Ids node if (ids_dims.size() != 1) { - ids_node = graph->AddNode(ids_name + "/reshape", - graph->builder_.CreateReshape(*ids_node, {-1}), - ids_type->precision(), - ids_type->layout()); + ids_node = + graph->Add(ids_name + "/reshape", + graph->builder_.CreateReshape(*ids_node->data(), {-1}), + ids_node->precision(), + ids_node->layout()); } - auto w_const_node = graph->AddNode(w_name, *w); + + // W node + auto w_node = graph->Add(w_name, *w); // Reshape the gather node with the inferred shape as the output node - auto gather_node = graph->AddNode( - out_name, - graph->builder_.CreateGather(*w_const_node, *ids_node, /* axis= */ 0)); + auto gather_node = + graph->Add(out_name, + graph->builder_.CreateGather( + *w_node->data(), *ids_node->data(), /* axis= */ 0), + w_node->precision(), + w_node->layout()); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *gather_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*gather_node->data(), + CvtShape(out_dims)), + gather_node->precision(), + gather_node->layout()); } return SUCCESS; } @@ -90,6 +97,6 @@ int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - lookup_table, +REGISTER_SUBGRAPH_BRIDGE(lookup_table, + kXPU, paddle::lite::subgraph::xpu::LookupTableConverter); diff --git a/lite/kernels/xpu/bridges/matmul_op.cc b/lite/kernels/xpu/bridges/matmul_op.cc old mode 100755 new mode 100644 index 330b336840148fa54d5c9f2eae39a08fdfad9557..c17ba8423c04eddf8b042c95e959d8b703c60c7a --- a/lite/kernels/xpu/bridges/matmul_op.cc +++ b/lite/kernels/xpu/bridges/matmul_op.cc @@ -57,19 +57,19 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto alpha = op_info->GetAttr("alpha"); // 
X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Matmul node @@ -80,52 +80,55 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (x_dims.size() != 3) { auto m = static_cast(x_dims[x_dims.size() - 2]); auto k = static_cast(x_dims[x_dims.size() - 1]); - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape(*x_node, {-1, m, k})); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape(*x_node->data(), {-1, m, k})); if (transpose_x) { - x_node = - graph->AddNode(x_name + "/reshape/transpose", - graph->builder_.CreateTranspose(*x_node, {0, 2, 1})); + x_node = graph->Add( + x_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*x_node->data(), {0, 2, 1})); } } // Reshape and transposed Y node if (y_dims.size() != 3) { auto k = static_cast(y_dims[y_dims.size() - 2]); auto n = static_cast(y_dims[y_dims.size() - 1]); - y_node = - graph->AddNode(y_name + "/reshape", - graph->builder_.CreateReshape(*y_node, {-1, k, n})); + y_node = graph->Add( + y_name + "/reshape", + graph->builder_.CreateReshape(*y_node->data(), {-1, k, n})); if (!transpose_y) { - y_node = - graph->AddNode(y_name + "/reshape/transpose", - graph->builder_.CreateTranspose(*y_node, {0, 2, 1})); + y_node = graph->Add( + y_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*y_node->data(), {0, 2, 1})); } } // Matmul node - auto matmul_node = graph->AddNode( - out_name, 
graph->builder_.CreateBatchMatmul(*x_node, *y_node)); + auto matmul_node = graph->Add( + out_name, + graph->builder_.CreateBatchMatmul(*x_node->data(), *y_node->data())); if (fabs(alpha - 1) > 1e-6f) { - matmul_node = graph->AddNode( - out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + matmul_node = graph->Add( + out_name, graph->builder_.CreateScale(*matmul_node->data(), alpha)); } if (out_dims.size() != 3) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *matmul_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape( + *matmul_node->data(), CvtShape(out_dims))); } } else if (x_dims.size() == 2 && y_dims.size() == 2) { // x: [M, K], y: [K, N], out: [M, N] if (transpose_x) { - x_node = graph->AddNode(x_name + "/transpose", - graph->builder_.CreateTranspose(*x_node, {1, 0})); + x_node = + graph->Add(x_name + "/transpose", + graph->builder_.CreateTranspose(*x_node->data(), {1, 0})); } - auto matmul_node = graph->AddNode( - out_name, - graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y)); + auto matmul_node = + graph->Add(out_name, + graph->builder_.CreateMatmul2D( + *x_node->data(), *y_node->data(), transpose_y)); if (fabs(alpha - 1) > 1e-6f) { - matmul_node = graph->AddNode( - out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + matmul_node = graph->Add( + out_name, graph->builder_.CreateScale(*matmul_node->data(), alpha)); } } else if (x_dims.size() == 1 && y_dims.size() == 1) { // x: [K], y: [K], out: [1] @@ -141,6 +144,6 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - matmul, +REGISTER_SUBGRAPH_BRIDGE(matmul, + kXPU, paddle::lite::subgraph::xpu::MatmulConverter); diff --git a/lite/kernels/xpu/bridges/mul_op.cc b/lite/kernels/xpu/bridges/mul_op.cc index 40780557457e3ed9b99e1cec2b5bdead7f2564dd..e12f767d13e4c1e01b671f5a4f7ba712dd8a1ef5 100644 --- a/lite/kernels/xpu/bridges/mul_op.cc +++ 
b/lite/kernels/xpu/bridges/mul_op.cc @@ -56,49 +56,50 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(x_matrix_dims[1], y_matrix_dims[0]); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Flatten X node if (x_dims.size() != 2) { - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape( - *x_node, {-1, static_cast(x_matrix_dims[1])})); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node->data(), {-1, static_cast(x_matrix_dims[1])})); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Flatten Y node if (y_dims.size() != 2) { - y_node = - graph->AddNode(y_name + "/reshape", - graph->builder_.CreateReshape( - *y_node, {static_cast(y_matrix_dims[0]), -1})); + y_node = graph->Add( + y_name + "/reshape", + graph->builder_.CreateReshape( + *y_node->data(), {static_cast(y_matrix_dims[0]), -1})); } // Reshape the matmul node with the inferred shape as the output node - auto matmul_node = graph->AddNode( - out_name, graph->builder_.CreateMatmul2D(*x_node, *y_node, false)); + auto matmul_node = graph->Add( + out_name, + graph->builder_.CreateMatmul2D(*x_node->data(), *y_node->data(), false)); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *matmul_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape( + *matmul_node->data(), CvtShape(out_dims))); } return REBUILD_WHEN_SHAPE_CHANGED; -} +} // namespace xpu } // namespace xpu } // 
namespace subgraph } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, mul, paddle::lite::subgraph::xpu::MulConverter); +REGISTER_SUBGRAPH_BRIDGE(mul, kXPU, paddle::lite::subgraph::xpu::MulConverter); diff --git a/lite/kernels/xpu/bridges/paddle_use_bridges.h b/lite/kernels/xpu/bridges/paddle_use_bridges.h old mode 100755 new mode 100644 index 588fcdd6e4fdd0115a731ec9918b632b19052cfd..bed88034ae8c00cf2de4e747234c49283cc18c68 --- a/lite/kernels/xpu/bridges/paddle_use_bridges.h +++ b/lite/kernels/xpu/bridges/paddle_use_bridges.h @@ -14,25 +14,25 @@ #pragma once -USE_SUBGRAPH_BRIDGE(XPU, relu); -USE_SUBGRAPH_BRIDGE(XPU, tanh); -USE_SUBGRAPH_BRIDGE(XPU, conv2d); -USE_SUBGRAPH_BRIDGE(XPU, depthwise_conv2d); -USE_SUBGRAPH_BRIDGE(XPU, elementwise_add); -USE_SUBGRAPH_BRIDGE(XPU, pool2d); -USE_SUBGRAPH_BRIDGE(XPU, softmax); -USE_SUBGRAPH_BRIDGE(XPU, mul); -USE_SUBGRAPH_BRIDGE(XPU, batch_norm); -USE_SUBGRAPH_BRIDGE(XPU, stack); -USE_SUBGRAPH_BRIDGE(XPU, gather); -USE_SUBGRAPH_BRIDGE(XPU, scale); -USE_SUBGRAPH_BRIDGE(XPU, lookup_table); -USE_SUBGRAPH_BRIDGE(XPU, slice); -USE_SUBGRAPH_BRIDGE(XPU, transpose); -USE_SUBGRAPH_BRIDGE(XPU, transpose2); -USE_SUBGRAPH_BRIDGE(XPU, reshape); -USE_SUBGRAPH_BRIDGE(XPU, reshape2); -USE_SUBGRAPH_BRIDGE(XPU, layer_norm); -USE_SUBGRAPH_BRIDGE(XPU, gelu); -USE_SUBGRAPH_BRIDGE(XPU, dropout); -USE_SUBGRAPH_BRIDGE(XPU, matmul); +USE_SUBGRAPH_BRIDGE(relu, kXPU); +USE_SUBGRAPH_BRIDGE(tanh, kXPU); +USE_SUBGRAPH_BRIDGE(conv2d, kXPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kXPU); +USE_SUBGRAPH_BRIDGE(elementwise_add, kXPU); +USE_SUBGRAPH_BRIDGE(pool2d, kXPU); +USE_SUBGRAPH_BRIDGE(softmax, kXPU); +USE_SUBGRAPH_BRIDGE(mul, kXPU); +USE_SUBGRAPH_BRIDGE(batch_norm, kXPU); +USE_SUBGRAPH_BRIDGE(stack, kXPU); +USE_SUBGRAPH_BRIDGE(gather, kXPU); +USE_SUBGRAPH_BRIDGE(scale, kXPU); +USE_SUBGRAPH_BRIDGE(lookup_table, kXPU); +USE_SUBGRAPH_BRIDGE(slice, kXPU); +USE_SUBGRAPH_BRIDGE(transpose, kXPU); +USE_SUBGRAPH_BRIDGE(transpose2, kXPU); 
+USE_SUBGRAPH_BRIDGE(reshape, kXPU); +USE_SUBGRAPH_BRIDGE(reshape2, kXPU); +USE_SUBGRAPH_BRIDGE(layer_norm, kXPU); +USE_SUBGRAPH_BRIDGE(gelu, kXPU); +USE_SUBGRAPH_BRIDGE(dropout, kXPU); +USE_SUBGRAPH_BRIDGE(matmul, kXPU); diff --git a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h b/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h deleted file mode 100644 index 3c76e0e8b5cf0842cb8d5a613cef7aee3cd13bdb..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "lite/kernels/xpu/bridges/registry.h" - -USE_XPU_BRIDGE(relu); -USE_XPU_BRIDGE(conv2d); -USE_XPU_BRIDGE(depthwise_conv2d); -USE_XPU_BRIDGE(elementwise_add); -USE_XPU_BRIDGE(pool2d); -USE_XPU_BRIDGE(softmax); -USE_XPU_BRIDGE(mul); -USE_XPU_BRIDGE(batch_norm); diff --git a/lite/kernels/xpu/bridges/pool_op.cc b/lite/kernels/xpu/bridges/pool_op.cc index 60787a342948251eb141daced2ba3cd2931a6da9..90653edcce26dd7da5ca0848368a98ea87a04c0d 100644 --- a/lite/kernels/xpu/bridges/pool_op.cc +++ b/lite/kernels/xpu/bridges/pool_op.cc @@ -50,21 +50,22 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto exclusive = op_info->GetAttr("exclusive"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Pool node if (pooling_type == "max") { if (global_pooling) { - graph->AddNode(out_name, graph->builder_.CreateGlobalMaxPool2D(*x_node)); + graph->Add(out_name, + graph->builder_.CreateGlobalMaxPool2D(*x_node->data())); } else { - graph->AddNode( + graph->Add( out_name, - graph->builder_.CreateMaxPool2D(*x_node, + graph->builder_.CreateMaxPool2D(*x_node->data(), CvtShape(ksize), CvtShape(strides), CvtShape(paddings), @@ -73,12 +74,13 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } } else if (pooling_type == "avg") { if (global_pooling) { - graph->AddNode(out_name, graph->builder_.CreateGlobalAvgPool2D(*x_node)); + graph->Add(out_name, + graph->builder_.CreateGlobalAvgPool2D(*x_node->data())); } else { // !exclusive ---> count_include_pad - graph->AddNode( + graph->Add( out_name, - graph->builder_.CreateAvgPool2D(*x_node, + graph->builder_.CreateAvgPool2D(*x_node->data(), CvtShape(ksize), CvtShape(strides), CvtShape(paddings), @@ -98,6 +100,6 @@ int PoolConverter(void* ctx, OpLite* 
op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - pool2d, +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kXPU, paddle::lite::subgraph::xpu::PoolConverter); diff --git a/lite/kernels/xpu/bridges/registry.cc b/lite/kernels/xpu/bridges/registry.cc deleted file mode 100644 index 4ab1b69a25a29aeb1c1ceaff25525459ef2e94cd..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/bridges/registry.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/xpu/bridges/registry.h" -#include - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -Factory& Factory::Instance() { - static Factory g_xpu_bridge; - return g_xpu_bridge; -} - -bool Factory::HasType(const std::string& op_type) const { - return map_.count(op_type); -} - -void Factory::Insert(const std::string& op_type, const func_type& func_name) { - map_.insert(std::make_pair(op_type, func_name)); -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/bridges/registry.h b/lite/kernels/xpu/bridges/registry.h deleted file mode 100644 index c990399c1cdeb865dc214d2f1c6d1970b6d27b85..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/bridges/registry.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/core/op_lite.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -// xpu network builder and constant tensors -class graph_ctx_type { - public: - std::shared_ptr builder; - std::shared_ptr params; -}; - -// var_name, xpu node pointer -using node_map_type = - std::unordered_map>; - -using func_type = std::function, graph_ctx_type*, const node_map_type&)>; -using cvt_map_type = std::unordered_map; -class Factory { - public: - static Factory& Instance(); - - const cvt_map_type& AllFunctions() const { return map_; } - bool HasType(const std::string& op_type) const; - void Insert(const std::string& op_type, const func_type& func_name); - Factory() = default; - - private: - cvt_map_type map_; - DISALLOW_COPY_AND_ASSIGN(Factory); -}; - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -// some platform-independent defintion -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif - -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_xpu_bridge_##op_type##__, \ - "REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \ - int __reg_xpu_bridge_##op_type##_Insert() { \ - paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \ - #op_type, cvt_func_name); \ - return 0; \ - } - -#define USE_XPU_BRIDGE(op_type) \ - extern int __reg_xpu_bridge_##op_type##_Insert(); \ - static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \ - 
__reg_xpu_bridge_##op_type##_Insert(); diff --git a/lite/kernels/xpu/bridges/reshape_op.cc b/lite/kernels/xpu/bridges/reshape_op.cc old mode 100755 new mode 100644 index eeee6c7244d7686a6c07734ffcfedcf46c92c195..5e9a37d18e742e2843da1801cccc60e9202ccbcf --- a/lite/kernels/xpu/bridges/reshape_op.cc +++ b/lite/kernels/xpu/bridges/reshape_op.cc @@ -33,22 +33,16 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); - auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); - CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); auto out_name = op_info->Output("Out").front(); - auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); - CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } std::vector shape; @@ -59,6 +53,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // CHECK(shape_tensor_type->layout() == DATALAYOUT(kNCHW)); for (auto shape_tensor_name : shape_tensor_names) { auto shape_tensor = scope->FindMutableTensor(shape_tensor_name); + CHECK(shape_tensor->persistable()); auto shape_tensor_data = shape_tensor->mutable_data(); shape.emplace_back(shape_tensor_data[0]); } @@ -73,6 +68,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); auto actual_shape = scope->FindMutableTensor(actual_shape_name); + CHECK(actual_shape->persistable()); auto actual_shape_dims = 
actual_shape->dims(); auto actual_shape_data = actual_shape->mutable_data(); auto shape = std::vector( @@ -86,9 +82,11 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_dims = operators::ValidateShape(shape, x_dims); // Reshape node - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *x_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*x_node->data(), + CvtShape(out_dims)), + x_node->precision(), + x_node->layout()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -97,9 +95,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - reshape2, +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kXPU, paddle::lite::subgraph::xpu::ReshapeConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - reshape, +REGISTER_SUBGRAPH_BRIDGE(reshape, + kXPU, paddle::lite::subgraph::xpu::ReshapeConverter); diff --git a/lite/kernels/xpu/bridges/scale_op.cc b/lite/kernels/xpu/bridges/scale_op.cc old mode 100755 new mode 100644 index a3423d290c271b9d7caf1cafdf59c5069def7a11..e6871390ac2690fa2e439ae56e59e49f342777e4 --- a/lite/kernels/xpu/bridges/scale_op.cc +++ b/lite/kernels/xpu/bridges/scale_op.cc @@ -46,17 +46,17 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { float bias = op_info->GetAttr("bias"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale node - graph->AddNode( - out_name, - graph->builder_.CreateScale(*x_node, scale, bias, bias_after_scale)); + graph->Add(out_name, + graph->builder_.CreateScale( + *x_node->data(), scale, bias, bias_after_scale)); return SUCCESS; } @@ -65,6 +65,6 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace 
paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - scale, +REGISTER_SUBGRAPH_BRIDGE(scale, + kXPU, paddle::lite::subgraph::xpu::ScaleConverter); diff --git a/lite/kernels/xpu/bridges/slice_op.cc b/lite/kernels/xpu/bridges/slice_op.cc old mode 100755 new mode 100644 index 90c91d3b594b91c5875830e0ce468e5ab80ecc72..3e4592d454ae9b79a51606ed9108c0ef17878276 --- a/lite/kernels/xpu/bridges/slice_op.cc +++ b/lite/kernels/xpu/bridges/slice_op.cc @@ -46,11 +46,11 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto ends = op_info->GetAttr>("ends"); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } // Calculate the begin and end of the slice in all of @@ -74,9 +74,9 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides.push_back(1); } } - graph->AddNode( - out_name, - graph->builder_.CreateStridedSlice(*input_node, begin, end, strides)); + graph->Add(out_name, + graph->builder_.CreateStridedSlice( + *input_node->data(), begin, end, strides)); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -85,6 +85,6 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - slice, +REGISTER_SUBGRAPH_BRIDGE(slice, + kXPU, paddle::lite::subgraph::xpu::SliceConverter); diff --git a/lite/kernels/xpu/bridges/softmax_op.cc b/lite/kernels/xpu/bridges/softmax_op.cc index 6deb536ef17c4043e09c4b63255b585ad1abf230..d964f29a86ac00034c61706af35f8ca220921ec0 100644 --- a/lite/kernels/xpu/bridges/softmax_op.cc +++ b/lite/kernels/xpu/bridges/softmax_op.cc @@ -44,15 +44,15 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - 
std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Softmax node - graph->AddNode(out_name, graph->builder_.CreateSoftmax(*x_node, axis)); + graph->Add(out_name, graph->builder_.CreateSoftmax(*x_node->data(), axis)); return SUCCESS; } @@ -61,6 +61,6 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - softmax, +REGISTER_SUBGRAPH_BRIDGE(softmax, + kXPU, paddle::lite::subgraph::xpu::SoftmaxConverter); diff --git a/lite/kernels/xpu/bridges/stack_op.cc b/lite/kernels/xpu/bridges/stack_op.cc old mode 100755 new mode 100644 index eb7d6d7b79c9cdc32b62254e429903c4cc8ea6f6..69673aaebaf0a112fe5b1339b6e253a3c3a0334b --- a/lite/kernels/xpu/bridges/stack_op.cc +++ b/lite/kernels/xpu/bridges/stack_op.cc @@ -46,19 +46,19 @@ int StackConverter(void* ctx, OpLite* op, KernelBase* kernel) { for (auto& x_name : x_names) { auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } - x_nodes.push_back(*x_node); + x_nodes.push_back(*x_node->data()); } // Stack node - graph->AddNode(y_name, - graph->builder_.CreateStack( - xtcl::network::TupleNode::make(x_nodes), axis)); + graph->Add(y_name, + graph->builder_.CreateStack( + xtcl::network::TupleNode::make(x_nodes), axis)); return SUCCESS; } @@ -67,6 +67,6 @@ int StackConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - stack, 
+REGISTER_SUBGRAPH_BRIDGE(stack, + kXPU, paddle::lite::subgraph::xpu::StackConverter); diff --git a/lite/kernels/xpu/bridges/transpose_op.cc b/lite/kernels/xpu/bridges/transpose_op.cc old mode 100755 new mode 100644 index b6823dd6a83b279150603a45401c5ddee3cb9c2c..4217fe0119be8584f0ca83408dca92100e652076 --- a/lite/kernels/xpu/bridges/transpose_op.cc +++ b/lite/kernels/xpu/bridges/transpose_op.cc @@ -44,19 +44,19 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr>("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Transpose node - graph->AddNode(out_name, - graph->builder_.CreateTranspose( - *x_node, - CvtShape( - std::vector(axis.begin(), axis.end())))); + graph->Add(out_name, + graph->builder_.CreateTranspose( + *x_node->data(), + CvtShape( + std::vector(axis.begin(), axis.end())))); return SUCCESS; } @@ -66,9 +66,9 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - transpose, +REGISTER_SUBGRAPH_BRIDGE(transpose, + kXPU, paddle::lite::subgraph::xpu::TransposeConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - transpose2, +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kXPU, paddle::lite::subgraph::xpu::TransposeConverter); diff --git a/lite/kernels/xpu/bridges/utility.cc b/lite/kernels/xpu/bridges/utility.cc old mode 100755 new mode 100644 index 79fad7c8b4e0ebcae89d42c92048ed5a3cc4b825..ce28f38019baa9752cc59e4dea1b2b1d1afc9fbc --- a/lite/kernels/xpu/bridges/utility.cc +++ b/lite/kernels/xpu/bridges/utility.cc @@ -103,7 +103,7 @@ DLDeviceType CvtDLDeviceType(TargetType in_type) { out_type = kDLGPU; break; case TARGET(kXPU): - out_type = kDLCPU; + out_type = static_cast(kDLXPU); break; 
default: LOG(FATAL) << "[XPU] Can not convert target type(" << TargetToStr(in_type) @@ -115,8 +115,8 @@ DLDeviceType CvtDLDeviceType(TargetType in_type) { std::shared_ptr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_precision, DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { out_shape = in_shape; diff --git a/lite/kernels/xpu/bridges/utility.h b/lite/kernels/xpu/bridges/utility.h old mode 100755 new mode 100644 index a02a5ddff0dd2bb222d5b68c36710adc039e418a..776955854567b919234e7c79dcf6321e8e24b70a --- a/lite/kernels/xpu/bridges/utility.h +++ b/lite/kernels/xpu/bridges/utility.h @@ -58,7 +58,6 @@ xtcl::Array CvtShape(const DDim& in_dims) { std::shared_ptr CvtTensor( const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_precision = PRECISION(kFloat), DataLayoutType in_layout = DATALAYOUT(kNCHW)); } // namespace xpu diff --git a/lite/kernels/xpu/graph_compute.cc b/lite/kernels/xpu/graph_compute.cc deleted file mode 100644 index b9e5be1a1d5c764c378f3fdf29d73148743962a4..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/graph_compute.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/xpu/graph_compute.h" -#include -#include -#include -#include -#include "lite/backends/xpu/runtime.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { - -void GraphCompute::PrepareForRun() { - // auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - CHECK(param.weight); - CHECK(lite::xpu::LoadModel(*param.weight, &runtime_)); - CHECK(runtime_ != nullptr); -} - -void GraphCompute::Run() { - auto& param = this->Param(); - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - auto start_time = GetCurrentUS(); - for (int i = 0; i < param.inputs.size(); i++) { - auto input_var_name = param.inputs[i].first; - auto input_tensor = param.inputs[i].second; - LOG(INFO) << "input dims[" << i << ":" << input_var_name - << "]: " << input_tensor->dims(); - auto input_tensor_data = input_tensor->data(); - for (int j = 0; j < input_tensor->dims().production(); j++) { - VLOG(3) << input_tensor_data[j]; - } - auto input_ndarray = xtcl::xNDArray::Empty( - input_tensor->dims().Vectorize(), {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto input_ndarray_data = - static_cast(input_ndarray.ToDLPack()->dl_tensor.data); - std::memcpy(input_ndarray_data, - input_tensor_data, - sizeof(float) * input_tensor->dims().production()); - runtime_->SetInputZeroCopy(input_var_name, - &input_ndarray.ToDLPack()->dl_tensor); - } - runtime_->Run(); - for (int i = 0; i < param.outputs.size(); i++) { - auto output_ndarray = runtime_->GetOutput(i); - auto output_var_name = param.outputs[i].first; - auto output_tensor = param.outputs[i].second; - output_tensor->Resize(output_ndarray.Shape()); - LOG(INFO) << "output dims[" << i << ":" << output_var_name - << "]: " << output_tensor->dims(); - auto output_ndarray_data = - static_cast(output_ndarray.ToDLPack()->dl_tensor.data); - auto 
output_tensor_data = output_tensor->mutable_data(); - std::memcpy(output_tensor_data, - output_ndarray_data, - sizeof(float) * output_tensor->dims().production()); - for (int j = 0; j < output_tensor->dims().production(); j++) { - VLOG(3) << output_tensor_data[j]; - } - } - LOG(INFO) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; -} - -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(graph_op, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::GraphCompute, - def) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .Finalize(); diff --git a/lite/kernels/xpu/graph_compute.h b/lite/kernels/xpu/graph_compute.h deleted file mode 100644 index 5406daa8a1b757989d006f4e0ea09baedc809e33..0000000000000000000000000000000000000000 --- a/lite/kernels/xpu/graph_compute.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { - -class GraphCompute : public KernelLite { - public: - using param_t = operators::GraphParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~GraphCompute() = default; - - private: - std::shared_ptr runtime_{nullptr}; -}; - -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc old mode 100755 new mode 100644 index 07a74b045477bcdff0d60913f20e79ff8497705b..15df4f80ca20a86b3f40e46e51f1683b49e9b05f --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -39,13 +39,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); - if (!bridges.Exists("XPU", op_type)) { + if (!bridges.Exists(op_type, "kXPU")) { return subgraph::FAILED; } auto kernel = inst.kernel(); - status |= bridges.Select("XPU", op_type)(reinterpret_cast(&graph), - const_cast(op), - const_cast(kernel)); + status |= bridges.Select(op_type, "kXPU")(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } @@ -57,26 +57,26 @@ int SubgraphEngine::BuildDeviceProgram() { std::vector device_inodes; std::vector device_onodes; for (auto& input_name : input_names_) { - if (graph.HasNode(input_name)) { - if (!graph.GetType(input_name).persistable()) { - device_inodes.push_back(graph.GetNode(input_name).get()); + if (graph.Has(input_name)) { + if (graph.Get(input_name)->is_data()) { + device_inodes.push_back(graph.Get(input_name)->data().get()); device_inames_.push_back(input_name); } else { LOG(WARNING) << "[XPU] Input node " << input_name - << " is skipped because 
it is a persistable node."; + << " is ignored because it is not a data node."; } } else { LOG(WARNING) << "[XPU] Input node " << input_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } for (auto& output_name : output_names_) { - if (graph.HasNode(output_name)) { - device_onodes.push_back(graph.GetNode(output_name).get()); + if (graph.Has(output_name)) { + device_onodes.push_back(graph.Get(output_name)->data().get()); device_onames_.push_back(output_name); } else { LOG(WARNING) << "[XPU] Output node " << output_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } CHECK(!device_inames_.empty()) @@ -98,14 +98,14 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); for (int i = 0; i < device_inames_.size(); i++) { - auto type = graph.GetType(device_inames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[XPU] Inputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: " << origin_idims_[i]; // Prepare the device input tensors which share data with the origin input @@ -122,14 +122,14 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i].byte_offset = 0; } for (int i = 0; i < device_onames_.size(); i++) { - auto type = graph.GetType(device_onames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto 
layout = node->layout(); origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[XPU] Outputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: " << origin_odims_[i]; // Prepare the device output tensors which share data with the origin output @@ -175,7 +175,7 @@ int SubgraphEngine::LaunchDeviceProgram() { // Update the data pointer of DLTensor to track the origin input tensors device_itensors_[i].data = const_cast(origin_itensors_[i]->raw_data()); - device_program_->SetInputZeroCopy(device_inames_[i], &device_itensors_[i]); + device_program_->SetInput(device_inames_[i], &device_itensors_[i]); } // Run the XPU model auto GetCurrentUS = []() -> double { diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h old mode 100755 new mode 100644 diff --git a/lite/model_parser/naive_buffer/naive_buffer.h b/lite/model_parser/naive_buffer/naive_buffer.h index 717dd3c5a6b0c48d6a1f2ae0d7dba9f08a6d99f3..9be2be954328e757e79a880f34b49c3f0cf77c7a 100644 --- a/lite/model_parser/naive_buffer/naive_buffer.h +++ b/lite/model_parser/naive_buffer/naive_buffer.h @@ -128,19 +128,23 @@ using Float64Builder = PrimaryBuilder; template class PrimaryListBuilder : public FieldBuilder { - std::vector data_; + const Primary* data_{nullptr}; + int size_{0}; public: using value_type = Primary; explicit PrimaryListBuilder(BinaryTable* table) : FieldBuilder(table) {} - PrimaryListBuilder(BinaryTable* table, const std::vector& val) - : FieldBuilder(table), data_(val) {} + PrimaryListBuilder(BinaryTable* table, const Primary* val, int size) + : FieldBuilder(table), data_(val), size_(size) {} /// Set data. 
- void set(const std::vector& x) { data_ = x; } + void set(const Primary* x, int size) { + data_ = x; + size_ = size; + } - const std::vector& data() const { return data_; } + const Primary* data() const { return data_; } /// Save information to the corresponding BinaryTable. void Save() override; @@ -149,14 +153,12 @@ class PrimaryListBuilder : public FieldBuilder { void Load() override; /// Number of elements. - size_t size() const { return data_.size(); } + size_t size() const { return size_; } - Type type() const override { - return core::StdTypeToRepr>(); - } + Type type() const override { return core::StdTypeToRepr(); } /// clear builder - void Clear() { data_.clear(); } + void Clear() { size_ = 0; } ~PrimaryListBuilder() = default; }; @@ -381,17 +383,14 @@ void PrimaryBuilder::Load() { template void PrimaryListBuilder::Load() { - CHECK(data_.empty()) << "Duplicate load"; + CHECK(data_ == nullptr) << "Duplicate load"; // Load number of elements first. uint64_t num_elems{}; memcpy(&num_elems, table()->cursor(), sizeof(uint64_t)); table()->Consume(sizeof(uint64_t)); - data_.resize(num_elems); - for (uint64_t i = 0; i < num_elems; i++) { - memcpy(&data_[i], table()->cursor(), sizeof(value_type)); - table()->Consume(sizeof(value_type)); - } + set(reinterpret_cast(table()->cursor()), num_elems); + table()->Consume(num_elems * sizeof(value_type)); } template @@ -404,7 +403,7 @@ void PrimaryListBuilder::Save() { table()->Require(num_elems * sizeof(value_type)); memcpy(table()->cursor(), - reinterpret_cast(&data_[0]), + reinterpret_cast(data_), num_elems * sizeof(value_type)); table()->Consume(num_elems * sizeof(value_type)); } diff --git a/lite/model_parser/naive_buffer/param_desc.cc b/lite/model_parser/naive_buffer/param_desc.cc index 4397b3c413e8a09d2e5e5b41b8f9222bcfab4e20..cc97b027160706b9c848a7b0dced22ab0fbed57a 100644 --- a/lite/model_parser/naive_buffer/param_desc.cc +++ b/lite/model_parser/naive_buffer/param_desc.cc @@ -150,9 +150,9 @@ void 
ParamDesc::SetDim(const std::vector& dim) { << "Data Type mismatch"; \ std::vector res; \ auto& data_builder = desc_->GetField>("data"); \ - auto& data = data_builder.data(); \ - size_t size = data.size() / sizeof(T); \ - auto* data_ptr = reinterpret_cast(&data[0]); \ + auto data = data_builder.data(); \ + size_t size = data_builder.size() / sizeof(T); \ + auto* data_ptr = reinterpret_cast(data); \ for (size_t i = 0; i < size; ++i) { \ res.push_back(data_ptr[i]); \ } \ @@ -178,8 +178,7 @@ GET_DATA_IMPL(double, FP64); data_builder->Clear(); \ size_t size = size__ * sizeof(T); \ auto* data_ptr = reinterpret_cast(data_ptr__); \ - std::vector data_vec(data_ptr, data_ptr + size); \ - data_builder->set(data_vec); + data_builder->set(data_ptr, size); #define SET_DATA_IMPL(T, type__) \ template <> \ diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 190cf7194c19a47f377755a9e9b61d890bc1a262..f307cb66acb5b34fea63a42646fc00ca957264bb 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -50,6 +50,7 @@ add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS}) add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS}) add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS}) +add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS}) # 2.basic ops not used in basic models add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) @@ -78,11 +79,9 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS}) add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) -add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS 
fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) - add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/collect_fpn_proposals_op.cc b/lite/operators/collect_fpn_proposals_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/collect_fpn_proposals_op.h b/lite/operators/collect_fpn_proposals_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/compare_op.cc b/lite/operators/compare_op.cc index 3210520cd5d71f239da258955df0e917e5e1153e..aa500ba35c37cf8af17091d8d37d8fd8d1a08e0e 100644 --- a/lite/operators/compare_op.cc +++ b/lite/operators/compare_op.cc @@ -54,7 +54,7 @@ bool CompareOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { } // namespace paddle REGISTER_LITE_OP(equal, paddle::lite::operators::CompareOp); -REGISTER_LITE_OP(notequal, paddle::lite::operators::CompareOp); +REGISTER_LITE_OP(not_equal, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(less_than, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(less_equal, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(greater_than, paddle::lite::operators::CompareOp); diff --git a/lite/operators/conditional_block_op.cc b/lite/operators/conditional_block_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/conditional_block_op.h b/lite/operators/conditional_block_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/distribute_fpn_proposals_op.cc b/lite/operators/distribute_fpn_proposals_op.cc old mode 100755 new mode 100644 diff --git 
a/lite/operators/distribute_fpn_proposals_op.h b/lite/operators/distribute_fpn_proposals_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/dropout_op.cc b/lite/operators/dropout_op.cc index bef089184751342545d56f6b16ed8554be775fae..03047de3b318ee2221809ee602d94f204568d723 100644 --- a/lite/operators/dropout_op.cc +++ b/lite/operators/dropout_op.cc @@ -33,7 +33,7 @@ bool DropoutOp::InferShape() const { param_.mask->Resize(x_dims); } // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->set_lod(param_.x->lod()); return true; } diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index ad3fcf79a3c9333d5525114e7b8f6abed4ae96d3..702950ae18db33f90073870d9cd19cd4aa7de91c 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -61,7 +61,7 @@ bool FcOpLite::InferShape() const { param_.output->Resize(lite::DDim(output_dims)); // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->set_lod(param_.input->lod()); return true; } diff --git a/lite/operators/graph_op.cc b/lite/operators/graph_op.cc deleted file mode 100644 index 018ce264e2f18862549a4abc0444d02dcbb573ee..0000000000000000000000000000000000000000 --- a/lite/operators/graph_op.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/graph_op.h" -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace operators { - -bool GraphOpLite::CheckShape() const { - CHECK_GE_OR_FALSE(param_.inputs.size(), 1UL); - CHECK_GE_OR_FALSE(param_.outputs.size(), 1UL); - return true; -} - -bool GraphOpLite::InferShape() const { return CheckShape(); /* enrich me */ } - -bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { - auto inputs = op_desc.Input("Inputs"); - auto weight = op_desc.Input("Weight"); - auto outputs = op_desc.Output("Outputs"); - - for (auto var : inputs) { - CHECK(scope->FindVar(var)); - param_.inputs.push_back( - std::make_pair(var, scope->FindVar(var)->GetMutable())); - } - - param_.weight = scope->FindVar(weight.front())->GetMutable(); - CHECK(param_.weight); - - for (auto var : outputs) { - CHECK(scope->FindVar(var)); - param_.outputs.push_back( - std::make_pair(var, scope->FindVar(var)->GetMutable())); - } - - return true; -} - -} // namespace operators -} // namespace lite -} // namespace paddle - -REGISTER_LITE_OP(graph_op, paddle::lite::operators::GraphOpLite); diff --git a/lite/operators/graph_op.h b/lite/operators/graph_op.h deleted file mode 100644 index 20a7cd9b8da9a6d4e01411f9cff9e9a3aabc6ff7..0000000000000000000000000000000000000000 --- a/lite/operators/graph_op.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/operators/op_params.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace operators { - -class GraphOpLite : public OpLite { - public: - GraphOpLite() {} - - explicit GraphOpLite(const std::string &type) : OpLite(type) {} - - bool CheckShape() const override; - - bool InferShape() const override; - - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; - - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - - std::string DebugString() const override { return "graph_op"; } - - private: - mutable GraphParam param_; -}; - -} // namespace operators -} // namespace lite -} // namespace paddle diff --git a/lite/operators/grid_sampler_op.cc b/lite/operators/grid_sampler_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/grid_sampler_op.h b/lite/operators/grid_sampler_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/instance_norm_op.cc b/lite/operators/instance_norm_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/instance_norm_op.h b/lite/operators/instance_norm_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/merge_lod_tensor_op.cc b/lite/operators/merge_lod_tensor_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/merge_lod_tensor_op.h b/lite/operators/merge_lod_tensor_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/reduce_prod_op.cc b/lite/operators/reduce_prod_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/reduce_prod_op.h b/lite/operators/reduce_prod_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/sequence_pool_concat_op.cc b/lite/operators/sequence_pool_concat_op.cc old mode 100755 
new mode 100644 diff --git a/lite/operators/sequence_pool_concat_op.h b/lite/operators/sequence_pool_concat_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/split_lod_tensor_op.cc b/lite/operators/split_lod_tensor_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/split_lod_tensor_op.h b/lite/operators/split_lod_tensor_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/subgraph_op.cc b/lite/operators/subgraph_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/subgraph_op.h b/lite/operators/subgraph_op.h old mode 100755 new mode 100644 diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt index 05fcc06b10ae5dc6b009ae087ce4e18f8d82e475..697c9874ef2072eedf6b654863e25e981fb6834a 100644 --- a/lite/tests/cv/CMakeLists.txt +++ b/lite/tests/cv/CMakeLists.txt @@ -1,3 +1,3 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm paddle_api_light ${lite_cv_deps} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm) endif() diff --git a/lite/tests/cv/cv_basic.h b/lite/tests/cv/cv_basic.h index 728d3167144bc6e03683b77803fb4887967eb524..92f68543bb15bdc15a8ed029f67ed33ca215361b 100644 --- a/lite/tests/cv/cv_basic.h +++ b/lite/tests/cv/cv_basic.h @@ -192,7 +192,6 @@ void nv21_bgra_basic(const uint8_t* in_data, nv2bgra(in_data, out_data, srcw, srch, 0, 1); } -/* /* 采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R 采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B @@ -217,6 +216,21 @@ void bgr_gray_basic(const uint8_t* in_data, } } } +void bgra_gray_basic(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch) { + for (int i = 0; i < srch; i++) { + const uint8_t* din_ptr = in_data + i * 4 * srcw; + uint8_t* dout_ptr = out_data + i * srcw; + for (int j = 0; j < srcw; j++) { + int sum = din_ptr[0] * 15 + 
din_ptr[1] * 75 + din_ptr[2] * 38; + sum = sum >> 7; + *dout_ptr++ = sum; + din_ptr += 4; + } + } +} void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -228,6 +242,17 @@ void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } } +void gray_bgra_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + *dst++ = 255; + src++; + } + } +} // bgr2bgra, rgb2rgba void hwc3_to_hwc4_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -340,6 +365,16 @@ void image_convert_basic(const uint8_t* in_data, (srcFormat == ImageFormat::GRAY && dstFormat == ImageFormat::BGR)) { gray_bgr_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGBA && + dstFormat == ImageFormat::GRAY) || + (srcFormat == ImageFormat::BGRA && + dstFormat == ImageFormat::GRAY)) { + bgra_gray_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::RGBA) || + (srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::BGRA)) { + gray_bgra_basic(in_data, out_data, srcw, srch); } else if ((srcFormat == ImageFormat::RGBA && dstFormat == ImageFormat::RGB) || (srcFormat == ImageFormat::BGRA && @@ -525,6 +560,7 @@ void image_resize_basic(const uint8_t* in_data, int y_flag = 0; // only one line if (y_in_start < 0) { y_flag = 1; + y_in_end = 0; } float b0 = ibeta[dy * 2]; float b1 = ibeta[dy * 2 + 1]; @@ -750,6 +786,26 @@ void image_flip_basic(const uint8_t* in_data, flipxy_basic(in_data, srch, srcw, out_data, num); } } +void gray_to_tensor_basic(const uint8_t* bgr, + float* output, + int width, + int height, + float* means, + float* scales, + int num) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + for (int h = 0; h < height; h++) { + 
const uint8_t* ptr_bgr = bgr + h * width * num; + float* ptr_h = output + h * width; + for (int i = 0; i < width; i++) { + *ptr_h++ = (ptr_bgr[0] - mean_val) * scale_val; + ptr_bgr += num; + } + } +} void bgr_to_tensor_chw_basic(const uint8_t* bgr, float* output, @@ -828,5 +884,8 @@ void image_to_tensor_basic(const uint8_t* in_data, } else if (layout == LayoutType::kNHWC && (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA)) { bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 4); + } else if (srcFormat == ImageFormat::GRAY && + (layout == LayoutType::kNHWC || layout == LayoutType::kNCHW)) { + gray_to_tensor_basic(in_data, output, srcw, srch, means, scales, 1); } } diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index eefd30f74f570f64d1b5617c9dddc836086394b1..e22e327e8b10d1237f5e07b5b0a8d95d3b19e70b 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -20,6 +20,7 @@ #include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" #include "lite/utils/cv/paddle_image_preprocess.h" +#include "time.h" // NOLINT DEFINE_int32(cluster, 3, "cluster id"); DEFINE_int32(threads, 1, "threads num"); @@ -28,15 +29,15 @@ DEFINE_int32(repeats, 1, "repeats times"); DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); -DEFINE_int32(srcFormat, 0, "input image format"); -DEFINE_int32(dstFormat, 1, "output image format"); +DEFINE_int32(srcFormat, 0, "input image format RGBA"); +DEFINE_int32(dstFormat, 2, "output image format RGB"); DEFINE_int32(srch, 1920, "input height"); DEFINE_int32(srcw, 1080, "input width"); DEFINE_int32(dsth, 960, "output height"); DEFINE_int32(dstw, 540, "output width"); DEFINE_int32(angle, 90, "rotate angel"); DEFINE_int32(flip_num, 0, "flip x"); -DEFINE_int32(layout, 0, "layout nchw"); +DEFINE_int32(layout, 1, "layout nchw"); typedef paddle::lite::utils::cv::ImageFormat ImageFormat; typedef 
paddle::lite::utils::cv::FlipParam FlipParam; @@ -99,7 +100,7 @@ void test_img(const std::vector& cluster_id, float rotate, FlipParam flip, LayoutType layout, - int test_iter = 1) { + int test_iter = 10) { #ifdef LITE_WITH_ARM paddle::lite::DeviceInfo::Init(); #endif @@ -221,7 +222,7 @@ void test_img(const std::vector& cluster_id, float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; if (FLAGS_check_result) { - LOG(INFO) << "image convert basic compute"; + // LOG(INFO) << "image convert basic compute"; image_convert_basic(src, basic_dst, (ImageFormat)srcFormat, @@ -230,7 +231,7 @@ void test_img(const std::vector& cluster_id, srch, out_size); - LOG(INFO) << "image resize basic compute"; + // LOG(INFO) << "image resize basic compute"; image_resize_basic(basic_dst, resize_basic, (ImageFormat)dstFormat, @@ -239,7 +240,7 @@ void test_img(const std::vector& cluster_id, dstw, dsth); - LOG(INFO) << "image rotate basic compute"; + // LOG(INFO) << "image rotate basic compute"; image_rotate_basic(resize_basic, tv_out_ratote_basic, (ImageFormat)dstFormat, @@ -247,7 +248,7 @@ void test_img(const std::vector& cluster_id, dsth, rotate); - LOG(INFO) << "image flip basic compute"; + // LOG(INFO) << "image flip basic compute"; image_flip_basic(resize_basic, tv_out_flip_basic, (ImageFormat)dstFormat, @@ -255,7 +256,7 @@ void test_img(const std::vector& cluster_id, dsth, flip); - LOG(INFO) << "image to tensor basic compute"; + // LOG(INFO) << "image to tensor basic compute"; image_to_tensor_basic(resize_basic, &tensor_basic, (ImageFormat)dstFormat, @@ -267,10 +268,13 @@ void test_img(const std::vector& cluster_id, } Timer t1; + Timer t_convert; + Timer t_resize; + Timer t_flip; + Timer t_rotate; + Timer t_tensor; LOG(INFO) << "saber cv compute"; - double to = 0; - double min_time = 100000; TransParam tparam; tparam.ih = srch; tparam.iw = srcw; @@ -285,15 +289,17 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i 
= 0; i < test_iter; ++i) { - t1.Reset(); t1.Start(); - LOG(INFO) << "image convert saber compute"; + // LOG(INFO) << "image convert saber compute"; + t_convert.Start(); // 方法一: image_preprocess.imageCovert(src, lite_dst); - image_preprocess.imageCovert( + image_preprocess.imageConvert( src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); + t_convert.Stop(); - LOG(INFO) << "image resize saber compute"; + // LOG(INFO) << "image resize saber compute"; + t_resize.Start(); // 方法一:image_preprocess.imageResize(lite_dst, resize_tmp); image_preprocess.imageResize(lite_dst, resize_tmp, @@ -302,8 +308,10 @@ void test_img(const std::vector& cluster_id, srch, dstw, dsth); + t_resize.Stop(); - LOG(INFO) << "image rotate saber compute"; + // LOG(INFO) << "image rotate saber compute"; + t_rotate.Start(); // 方法一: image_preprocess.imageRotate(resize_tmp, tv_out_ratote); image_preprocess.imageRotate(resize_tmp, tv_out_ratote, @@ -311,13 +319,17 @@ void test_img(const std::vector& cluster_id, dstw, dsth, rotate); + t_rotate.Stop(); - LOG(INFO) << "image flip saber compute"; + // LOG(INFO) << "image flip saber compute"; + t_flip.Start(); // 方法一: image_preprocess.imageFlip(resize_tmp, tv_out_flip); image_preprocess.imageFlip( resize_tmp, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip); + t_flip.Stop(); - LOG(INFO) << "image to tensor compute"; + // LOG(INFO) << "image to tensor compute"; + t_tensor.Start(); // 方法一: image_preprocess.image2Tensor( // resize_tmp, &dst_tensor, layout, means, scales); image_preprocess.image2Tensor(resize_tmp, @@ -328,16 +340,27 @@ void test_img(const std::vector& cluster_id, layout, means, scales); - + t_tensor.Stop(); t1.Stop(); - double tdiff = t1.LapTimes().Avg(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } } - LOG(INFO) << "image trans total time : " << to - << ", avg time : " << to / test_iter; + LOG(INFO) << "image convert avg time : " << t_convert.LapTimes().Avg() + << ", min time: " << t_convert.LapTimes().Min() 
+ << ", max time: " << t_convert.LapTimes().Max(); + LOG(INFO) << "image resize avg time : " << t_resize.LapTimes().Avg() + << ", min time: " << t_resize.LapTimes().Min() + << ", max time: " << t_resize.LapTimes().Max(); + LOG(INFO) << "image rotate avg time : " << t_rotate.LapTimes().Avg() + << ", min time: " << t_rotate.LapTimes().Min() + << ", max time: " << t_rotate.LapTimes().Max(); + LOG(INFO) << "image flip avg time : " << t_flip.LapTimes().Avg() + << ", min time: " << t_flip.LapTimes().Min() + << ", max time: " << t_flip.LapTimes().Max(); + LOG(INFO) << "image tensor avg time : " << t_tensor.LapTimes().Avg() + << ", min time: " << t_tensor.LapTimes().Min() + << ", max time: " << t_tensor.LapTimes().Max(); + LOG(INFO) << "image trans total avg time : " << t1.LapTimes().Avg() + << ", min time: " << t1.LapTimes().Min() + << ", max time: " << t1.LapTimes().Max(); double max_ratio = 0; double max_diff = 0; @@ -536,7 +559,7 @@ void test_img(const std::vector& cluster_id, } } -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_convert_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -546,19 +569,16 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1}) { - if ((dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || + if ((srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + (dstFormat == ImageFormat::GRAY)) { + continue; + } + if ((dstFormat == ImageFormat::NV12 || + dstFormat == 
ImageFormat::NV21) && + (srcFormat == ImageFormat::GRAY)) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -591,7 +611,7 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { } } #endif -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_resize_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -601,21 +621,13 @@ TEST(TestImageConvertRand, test_func_image_resize_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4, 11}) { for (auto layout : {1}) { if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || + dstFormat == ImageFormat::NV21 || (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + dstFormat == ImageFormat::GRAY) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -656,25 +668,10 @@ TEST(TestImageConvertRand, test_func_image_trans_preprocess) { for (auto ww : {32, 112}) { for (auto hh : {112}) { for (auto rotate : {90, 180, 270}) { - for (auto flip : {0, 1, 2}) { - for (auto srcFormat : {11}) { - for (auto dstFormat : {3}) { + for (auto flip : {-1, 0, 1}) { + for (auto srcFormat : {0}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1, 3}) { - if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || - 
srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { - continue; - } if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { if (w % 2) { // is not ou shu, two line y == one line @@ -717,7 +714,8 @@ TEST(TestImageConvertCustom, test_func_image_preprocess_custom) { (ImageFormat)FLAGS_dstFormat, FLAGS_angle, (FlipParam)FLAGS_flip_num, - (LayoutType)FLAGS_layout); + (LayoutType)FLAGS_layout, + 20); } #endif #endif diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 462237674283f96dad14406dde344bd5fbf32202..a7ae4145737a7ee6fbce61663ff068b44d6270b0 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,68 +1,70 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lrn_compute 
SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS 
arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_logical_xor_compute SRCS logical_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework 
${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) + lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_logical_xor_compute SRCS 
logical_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_compute SRCS 
mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) if(LITE_BUILD_EXTRA) - lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_stack_compute SRCS 
stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
#lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() - lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) endif() diff --git a/lite/tests/kernels/batch_norm_compute_test.cc b/lite/tests/kernels/batch_norm_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae65e0e3c320ff153a99d2a1656227bad34428d4 --- /dev/null +++ b/lite/tests/kernels/batch_norm_compute_test.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class BatchNormComputeTest : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string op_type_ = "batch_norm"; + std::string input_ = "x"; + std::string scale_ = "scale"; + std::string bias_ = "bias"; + std::string mean_ = "mean"; + std::string variance_ = "variance"; + std::string output_ = "y"; + std::string mean_out_ = "mean_out"; + std::string saved_mean_ = "saved_mean"; + std::string variance_out_ = "variance_out"; + std::string saved_variance_ = "saved_variance"; + DDim dims_{{1, 2, 3, 4}}; + bool use_global_stats_ = false; + float momentum_ = 0.9; + float epsilon_ = 1e-5f; + std::string data_layout_ = "NCHW"; + int is_test_ = 1; + + public: + BatchNormComputeTest(const Place& place, + const std::string& alias, + DDim dims, + float epsilon) + : TestCase(place, alias), dims_(dims), epsilon_(epsilon) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(input_); + auto scale = scope->FindTensor(scale_); + auto bias = scope->FindTensor(bias_); + auto mean = scope->FindTensor(mean_); + auto variance = scope->FindTensor(variance_); + + auto y = scope->NewTensor(output_); + auto mean_out = scope->NewTensor(mean_out_); + auto variance_out = scope->NewTensor(variance_out_); + auto saved_mean = scope->NewTensor(saved_mean_); + auto saved_variance = scope->NewTensor(saved_variance_); + CHECK(y); + CHECK(mean_out); + CHECK(variance_out); + CHECK(saved_mean); + CHECK(saved_variance); + y->Resize(dims_); + + int64_t channel_size = 0; + if (data_layout_ == "NCHW") { + channel_size = dims_[1]; + } else { + LOG(FATAL) << "Unknown storage order: " << data_layout_; + } + mean_out->Resize({channel_size}); + variance_out->Resize({channel_size}); + saved_mean->Resize({channel_size}); + saved_variance->Resize({channel_size}); + + auto x_data = x->data(); + auto y_data = y->mutable_data(); + auto scale_data = scale->data(); + auto bias_data = bias->data(); + auto mean_data = mean->data(); + auto variance_data = variance->data(); + + int64_t outer_size = 0; + int64_t inner_size = 0; + if (data_layout_ == "NCHW") { + outer_size = 
dims_[0]; + inner_size = dims_.Slice(2, dims_.size()).production(); + } else { + LOG(FATAL) << "Unknown storage order: " << data_layout_; + } + auto x_ptr = x_data; + auto y_ptr = y_data; + for (int o = 0; o < outer_size; o++) { + for (int c = 0; c < channel_size; c++) { + for (int i = 0; i < inner_size; i++) { + float norm_x = + (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon_); + *y_ptr = norm_x * scale_data[c] + bias_data[c]; + x_ptr++; + y_ptr++; + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("X", {input_}); + op_desc->SetInput("Bias", {bias_}); + op_desc->SetInput("Scale", {scale_}); + op_desc->SetInput("Mean", {mean_}); + op_desc->SetInput("Variance", {variance_}); + op_desc->SetOutput("Y", {output_}); + op_desc->SetOutput("MeanOut", {mean_out_}); + op_desc->SetOutput("VarianceOut", {variance_out_}); + op_desc->SetOutput("SavedMean", {saved_mean_}); + op_desc->SetOutput("SavedVariance", {saved_variance_}); + op_desc->SetAttr("epsilon", epsilon_); + op_desc->SetAttr("momentum", momentum_); + op_desc->SetAttr("use_global_stats", use_global_stats_); + op_desc->SetAttr("data_layout", data_layout_); + op_desc->SetAttr("is_test", is_test_); + } + + void PrepareData() override { + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + + DDim scale_dim({dims_[1]}); + std::vector scale(scale_dim.production()); + fill_data_rand(scale.data(), -1.f, 1.f, scale_dim.production()); + + std::vector bias(scale_dim.production()); + fill_data_rand(bias.data(), -1.f, 1.f, scale_dim.production()); + + std::vector mean(scale_dim.production()); + fill_data_rand(mean.data(), -1.f, 1.f, scale_dim.production()); + + std::vector variance(scale_dim.production()); + fill_data_rand(variance.data(), 0.f, 1.f, scale_dim.production()); + + SetCommonTensor(input_, dims_, din.data()); + SetCommonTensor(scale_, scale_dim, scale.data()); + SetCommonTensor(bias_, 
scale_dim, bias.data()); + SetCommonTensor(mean_, scale_dim, mean.data()); + SetCommonTensor(variance_, scale_dim, variance.data()); + } +}; + +TEST(BatchNorm, precision) { + LOG(INFO) << "test BatchNorm op"; + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#elif defined(LITE_WITH_NPU) + place = TARGET(kNPU); +#else + return; +#endif + + for (auto dims : + std::vector>{{1, 2, 3, 4}, {5, 6, 7, 8}}) { + for (auto epsilon : {1e-5f}) { + std::unique_ptr tester( + new BatchNormComputeTest(place, "def", DDim(dims), epsilon)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision( + {"mean_out", "saved_mean", "variance_out", "saved_variance"}); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/grid_sampler_compute_test.cc b/lite/tests/kernels/grid_sampler_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/instance_norm_compute_test.cc b/lite/tests/kernels/instance_norm_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/pool_compute_test.cc b/lite/tests/kernels/pool_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d94c2e5154b88e9455c1c3cf8d937d13e825a858 --- /dev/null +++ b/lite/tests/kernels/pool_compute_test.cc 
@@ -0,0 +1,367 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class PoolComputeTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string op_type_ = "pool2d"; + std::string x_ = "x"; + std::string out_ = "out"; + DDim dims_{{1, 2, 3, 4}}; + std::string pooling_type_ = "max"; + bool global_pooling_ = false; + std::vector strides_{1, 1}; + std::vector paddings_{0, 0}; + std::vector ksize_{2, 2}; + bool exclusive_ = true; + bool ceil_mode_ = false; + bool adaptive_ = false; + std::string padding_algorithm_; + + public: + PoolComputeTest(const Place& place, + const std::string& alias, + DDim dims, + std::string pooling_type, + bool global_pooling, + std::vector strides = {1, 1}, + std::vector paddings = {0, 0}, + std::vector ksize = {2, 2}, + bool exclusive = true, + bool ceil_mode = false, + bool adaptive = false, + std::string padding_algorithm = "") + : TestCase(place, alias), + dims_(dims), + pooling_type_(pooling_type), + global_pooling_(global_pooling), + strides_(strides), + paddings_(paddings), + ksize_(ksize), + exclusive_(exclusive), + ceil_mode_(ceil_mode), + adaptive_(adaptive) {} + + void RunBaseline(Scope* scope) override { + 
std::vector paddings_new{paddings_}; + if (paddings_new.size() == 1L) { + paddings_new = std::vector(4, paddings_new[0]); + } else if (paddings_new.size() == 2L) { + paddings_new.insert(paddings_new.begin(), paddings_new[0]); + paddings_new.insert(paddings_new.begin() + 2, paddings_new[2]); + } + CHECK_EQ(paddings_new.size(), 4L); + if (padding_algorithm_ == "SAME") { + for (int i = 0; i < strides_.size(); ++i) { + int out_size = (dims_[i + 2] + strides_[i] - 1) / strides_[i]; + int pad_sum = + std::max((out_size - 1) * strides_[i] + ksize_[i] - dims_[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings_new.begin() + i * 2) = pad_0; + *(paddings_new.begin() + i * 2 + 1) = pad_1; + } + } + if (padding_algorithm_ == "VALID" || global_pooling_ || adaptive_) { + for (size_t i = 0; i < paddings_new.size(); i++) { + paddings_new[i] = 0; + } + } + + std::vector ksize_new{ksize_}; + if (global_pooling_) { + ksize_new.clear(); + ksize_new.push_back(dims_[2]); + ksize_new.push_back(dims_[3]); + } + + std::vector out_shape{dims_[0], dims_[1]}; + if (adaptive_) { + out_shape.insert(out_shape.end(), ksize_new.begin(), ksize_new.end()); + } else { + for (size_t i = 0; i < ksize_new.size(); ++i) { + int out_size; + if (!ceil_mode_) { + out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] + + paddings_new[2 * i + 1]) / + strides_[i] + + 1; + } else { + out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] + + paddings_new[2 * i + 1] + strides_[i] - 1) / + strides_[i] + + 1; + } + out_shape.push_back(out_size); + } + } + + auto out = scope->NewTensor(out_); + CHECK(out); + out->Resize(DDim(out_shape)); + auto out_dims = out->dims(); + auto dst_ptr = out->mutable_data(); + + auto x = scope->FindTensor(x_); + auto src_ptr = x->data(); + + int in_n = dims_[0]; + int in_c = dims_[1]; + int in_h = dims_[2]; + int in_w = dims_[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + 
int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize_new[0]; + int window_w = ksize_new[1]; + int stride_h = strides_[0]; + int stride_w = strides_[1]; + int pad_t = paddings_new[0]; + int pad_l = paddings_new[2]; + + if (global_pooling_) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type_ == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? cur_val : res; + } + } else if (pooling_type_ == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_t) < 0 ? 0 : sh - pad_t; + eh = (eh - pad_t) > in_h ? in_h : eh - pad_t; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_l) < 0 ? 0 : sw - pad_l; + ew = (ew - pad_l) > in_w ? in_w : ew - pad_l; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type_ == "max") { + res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; + } + if (pooling_type_ == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type_ == "avg") { + if (exclusive_) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("pooling_type", pooling_type_); + op_desc->SetAttr("global_pooling", global_pooling_); + op_desc->SetAttr("strides", strides_); + op_desc->SetAttr("paddings", paddings_); + op_desc->SetAttr("ksize", ksize_); + op_desc->SetAttr("exclusive", exclusive_); + op_desc->SetAttr("ceil_mode", ceil_mode_); + op_desc->SetAttr("adaptive", adaptive_); + if (!padding_algorithm_.empty()) { + op_desc->SetAttr("padding_algorithm", padding_algorithm_); + } + } + + void PrepareData() override { + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(x_, dims_, din.data()); + } +}; + +void TestPoolGlobal(Place place, float abs_error = 2e-5) { + for (auto dims : std::vector>{{2, 3, 4, 5}}) { + for (std::string pooling_type : {"max", "avg"}) { + std::unique_ptr tester( + new PoolComputeTest(place, "def", DDim(dims), pooling_type, true)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } +} + +void TestPoolAlgorithm(Place place, float abs_error = 2e-5) { + for (auto dims : std::vector>{{2, 3, 4, 5}}) { + for (auto pooling_type : {"max", "avg"}) { + for (auto padding_algorithm : {"SAME", "VALID"}) { + std::unique_ptr tester( + new PoolComputeTest(place, + "def", + DDim(dims), + pooling_type, + false, + {2, 2}, + {0, 0}, + {2, 2}, + true, + false, + false, + padding_algorithm)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } + } +} + +void TestPoolHelper(Place place, + 
float abs_error, + std::vector dims, + std::string pooling_type, + std::vector strides, + std::vector paddings, + std::vector ksize) { + std::unique_ptr tester(new PoolComputeTest( + place, "def", DDim(dims), pooling_type, false, strides, paddings, ksize)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); +} + +void TestPoolStrides(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 2}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {2, 2}, {0, 0}, {2, 2}); + } +} + +void TestPoolPaddings(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {1, 1}, {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {0, 0, 1, 1}, + {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {1, 0, 1, 0}, + {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {1, 0, 0, 1}, + {2, 2}); + } +} + +void TestPoolKsize(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + for (auto ksize : {2, 3}) { + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {0, 0}, + {ksize, ksize}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {2, 2}, + {1, 1}, + {ksize, ksize}); + } + } +} + +TEST(Pool, precision) { + LOG(INFO) << "test pool op"; + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#else + return; +#endif + + TestPoolGlobal(place, abs_error); + TestPoolAlgorithm(place, 
abs_error); + TestPoolStrides(place, abs_error); + TestPoolPaddings(place, abs_error); + TestPoolKsize(place, abs_error); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/reduce_prod_compute_test.cc b/lite/tests/kernels/reduce_prod_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc old mode 100755 new mode 100644 index 85cd724148290a06a9303515004e8d003c32c053..b82c291a4167a0c704d72a1814e9544a467d057f --- a/lite/tests/kernels/reshape_compute_test.cc +++ b/lite/tests/kernels/reshape_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -29,19 +30,19 @@ class ReshapeComputeTester : public arena::TestCase { std::string xshape_ = "xshape"; std::vector shape_tensor_vct_; std::string shape_tensor_; - DDim x_dims_; + DDim dims_; std::vector shape_; bool inplace_ = false; public: ReshapeComputeTester(const Place& place, const std::string& alias, - DDim x_dims, + DDim dims, std::vector shape, bool is_shape_tensor_vct = false, bool is_shape_tensor = false, bool is_shape = true) - : TestCase(place, alias), x_dims_(x_dims) { + : TestCase(place, alias), dims_(dims) { if (is_shape_tensor_vct) { for (size_t i = 0; i < shape.size(); i++) { shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i)); @@ -60,7 +61,6 @@ class ReshapeComputeTester : public arena::TestCase { CHECK(out); auto* x = scope->FindTensor(input_); - auto x_dims = x->dims(); std::vector out_shape; if (shape_tensor_vct_.size() > 0) { @@ -86,8 +86,8 @@ class ReshapeComputeTester : public arena::TestCase { CHECK_EQ(unk_dim_idx, -1); unk_dim_idx = i; } else if (out_shape[i] == 0) { - CHECK_LE(i, x_dims.size()); - final_out_shape[i] = x_dims[i]; + CHECK_LE(i, dims_.size()); + final_out_shape[i] = 
dims_[i]; } else if (out_shape[i] > 0) { final_out_shape[i] = out_shape[i]; } else { @@ -97,18 +97,18 @@ class ReshapeComputeTester : public arena::TestCase { } if (unk_dim_idx > -1) { - final_out_shape[unk_dim_idx] = x_dims.production() / cap; + final_out_shape[unk_dim_idx] = dims_.production() / cap; } out->Resize(final_out_shape); auto x_data = x->data(); auto out_data = out->mutable_data(); - memcpy(out_data, x_data, sizeof(float) * x_dims.production()); + memcpy(out_data, x_data, sizeof(float) * dims_.production()); if (op_type_ == "reshape2") { auto* xshape = scope->NewTensor(xshape_); - auto xshape_dims = x_dims.Vectorize(); + auto xshape_dims = dims_.Vectorize(); xshape_dims.insert(xshape_dims.begin(), 0); xshape->Resize(xshape_dims); } @@ -134,11 +134,9 @@ class ReshapeComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(x_dims_.production()); - for (int i = 0; i < x_dims_.production(); i++) { - data[i] = i * 1.1; - } - SetCommonTensor(input_, x_dims_, data.data()); + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); if (shape_tensor_vct_.size() > 0) { for (size_t i = 0; i < shape_.size(); i++) { @@ -161,13 +159,16 @@ TEST(Reshape, precision) { LOG(INFO) << "test Reshape op"; float abs_error = 2e-5; Place place; -#ifdef LITE_WITH_XPU +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_XPU) place = TARGET(kXPU); #else return; #endif - DDim x_dims{{2, 3, 4, 5}}; + DDim dims{{2, 3, 4, 5}}; std::vector> shapes{{5, 4, 3, 2}, {2, 3, 20}, {2, 60}, @@ -176,8 +177,11 @@ TEST(Reshape, precision) { {0, 0, 20}, {0, 0, -1}}; for (auto shape : shapes) { +#ifdef LITE_WITH_NPU + if (dims.size() > 4 || shape.size() > 4) continue; +#endif std::unique_ptr tester( - new ReshapeComputeTester(place, "def", x_dims, shape)); + new ReshapeComputeTester(place, "def", dims, shape)); 
arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision({"xshape"}); } diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index 706936d2b1e113a2acde888e91c6e0a2f2d6582f..1ededcd52d3fb4c8881a391dce5e7f22e87cdb44 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -23,31 +24,33 @@ namespace lite { class ScaleComputeTester : public arena::TestCase { protected: // common attributes for this op. - std::string input_ = "x"; - std::string output_ = "out"; + std::string x_ = "x"; + std::string out_ = "out"; + DDim x_dims_{{100, 20}}; float scale_ = 0.; float bias_ = 0.; - DDim dims_{{100, 20}}; bool bias_after_scale_; public: ScaleComputeTester(const Place& place, const std::string& alias, + const DDim& x_dims, float scale, float bias, bool bias_after_scale) : TestCase(place, alias), + x_dims_(x_dims), scale_(scale), bias_(bias), bias_after_scale_(bias_after_scale) {} void RunBaseline(Scope* scope) override { - auto* out = scope->NewTensor(output_); + auto* out = scope->NewTensor(out_); CHECK(out); - out->Resize(dims_); + out->Resize(x_dims_); auto* out_data = out->mutable_data(); - auto* x = scope->FindTensor(input_); + auto* x = scope->FindTensor(x_); const auto* x_data = x->data(); float bias = bias_; @@ -56,35 +59,34 @@ class ScaleComputeTester : public arena::TestCase { bias *= scale_; } - for (int i = 0; i < dims_.production(); i++) { + for (int i = 0; i < x_dims_.production(); i++) { out_data[i] = x_data[i] * scale_ + bias; } } void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("scale"); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); 
op_desc->SetAttr("scale", scale_); op_desc->SetAttr("bias", bias_); op_desc->SetAttr("bias_after_scale", bias_after_scale_); } void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); + std::vector x(x_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + SetCommonTensor(x_, x_dims_, x.data()); } }; TEST(Scale, precision) { Place place; float abs_error = 2e-5; -#if defined(LITE_WITH_ARM) +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 4e-3; // Using fp16 in NPU +#elif defined(LITE_WITH_ARM) place = TARGET(kARM); #elif defined(LITE_WITH_XPU) place = TARGET(kXPU); @@ -95,13 +97,16 @@ TEST(Scale, precision) { return; #endif - for (float scale : {0.123, 2., -1.2}) { - for (float bias : {1., 0., -1.2331}) { - for (bool bias_before : {true, false}) { - std::unique_ptr tester( - new ScaleComputeTester(place, "def", scale, bias, bias_before)); - arena::Arena arena(std::move(tester), place, abs_error); - arena.TestPrecision(); + for (auto x_dims : + std::vector>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { + for (float scale : {0.123, 2., -1.2}) { + for (float bias : {1., 0., -1.2331}) { + for (bool bias_after_scale : {true, false}) { + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim(x_dims), scale, bias, bias_after_scale)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } } } } @@ -117,8 +122,8 @@ TEST(Scale, performance) { return; #endif - std::unique_ptr tester( - new ScaleComputeTester(place, "def", 1.2, 1.1, true)); + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim(std::vector{5, 2, 3, 4}), 1.2, 1.1, true)); // To modify the arm context, one can retrive the context as follows. 
// #ifdef LITE_WITH_ARM diff --git a/lite/tests/kernels/shuffle_channel_compute_test.cc b/lite/tests/kernels/shuffle_channel_compute_test.cc index 66123625fae606a9022537698cdc1032abb13451..66dd7bbe37b6ecc90df60a03543445e86721a938 100644 --- a/lite/tests/kernels/shuffle_channel_compute_test.cc +++ b/lite/tests/kernels/shuffle_channel_compute_test.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine. -// Open this. -/*#include +#include #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -40,28 +39,29 @@ class ShuffleChannelComputeTester : public arena::TestCase { auto* out = scope->NewTensor(output_); CHECK(out); out->Resize(dims_); - auto* outputs = out->mutable_data(); + auto* out_data = out->mutable_data(); + auto* x = scope->FindTensor(input_); - const auto* inputs = x->data(); - DDim x_dims = x->dims(); - int num = x->dims()[0]; - int channel = x->dims()[1]; - int height = x->dims()[2]; - int width = x->dims()[3]; - int fea_size = channel * height * width; + const auto* in_data = x->data(); + + int num = dims_[0]; + int channel = dims_[1]; + int height = dims_[2]; + int width = dims_[3]; + int feather_size = channel * height * width; int spatial_size = height * width; - int group_row = group_; - int group_col = channel / group_; - for (int k = 0; k < num; ++k) { - inputs += k * fea_size; - outputs += k * fea_size; - for (int i = 0; i < group_row; ++i) { - for (int j = 0; j < group_col; ++j) { - const float* p_i = inputs + (i * group_col + j) * spatial_size; - float* p_o = outputs + (j * group_row + i) * spatial_size; + int group_num = group_; + int group_size = channel / group_; + for (int n = 0; n < num; n++) { + for (int i = 0; i < group_num; ++i) { + for (int j = 0; 
j < group_size; ++j) { + const float* p_i = in_data + (i * group_size + j) * spatial_size; + float* p_o = out_data + (j * group_num + i) * spatial_size; memcpy(p_o, p_i, spatial_size * sizeof(float)); } } + in_data += feather_size; + out_data += feather_size; } } @@ -73,35 +73,33 @@ class ShuffleChannelComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); } }; -void test_shuffle_channel(Place place) { - for (int group : {4}) { +void test_shuffle_channel(Place place, float abs_error = 2e-5) { + for (int group : {2, 4, 8}) { std::unique_ptr tester( new ShuffleChannelComputeTester(place, "def", group)); - arena::Arena arena(std::move(tester), place, 2e-5); + arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } } TEST(ShuffleChannel, precision) { -// #ifdef LITE_WITH_X86 -// Place place(TARGET(kX86)); -// #endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_shuffle_channel(place); + Place place; + float abs_error = 2e-5; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#else + return; #endif + + test_shuffle_channel(place, abs_error); } } // namespace lite } // namespace paddle -*/ diff --git a/lite/tests/kernels/softmax_compute_test.cc b/lite/tests/kernels/softmax_compute_test.cc old mode 100755 new mode 100644 index 94100da2b13aecea8ac50ea7a4ae52350455051f..a91f6534ffa1f8022e2005cc83255d306adf77c1 --- a/lite/tests/kernels/softmax_compute_test.cc +++ b/lite/tests/kernels/softmax_compute_test.cc @@ -25,33 +25,33 @@ class SoftmaxComputeTest : public arena::TestCase { protected: // common attributes for this op. 
std::string op_type_ = "softmax"; - std::string input_ = "x"; - std::string output_ = "out"; - DDim dims_{{1, 2, 3, 4}}; + DDim x_dims_{{1, 2, 3, 4}}; + std::string x_ = "x"; + std::string out_ = "out"; int axis_ = 1; public: SoftmaxComputeTest(const Place& place, const std::string& alias, - DDim dims, + DDim x_dims, int axis) - : TestCase(place, alias), dims_(dims), axis_(axis) {} + : TestCase(place, alias), x_dims_(x_dims), axis_(axis) {} void RunBaseline(Scope* scope) override { - auto x = scope->FindTensor(input_); - auto out = scope->NewTensor(output_); + auto x = scope->FindTensor(x_); + auto out = scope->NewTensor(out_); CHECK(out); - out->Resize(dims_); + out->Resize(x_dims_); auto x_data = x->data(); auto out_data = out->mutable_data(); - auto x_rank = dims_.size(); + auto x_rank = x_dims_.size(); if (axis_ < 0) { axis_ += x_rank; } - int axis_size = dims_[axis_]; - int outer_num = dims_.Slice(0, axis_).production(); - int inner_num = dims_.Slice(axis_ + 1, x_rank).production(); + int axis_size = x_dims_[axis_]; + int outer_num = x_dims_.Slice(0, axis_).production(); + int inner_num = x_dims_.Slice(axis_ + 1, x_rank).production(); int compute_size = outer_num * inner_num; for (int i = 0; i < compute_size; i++) { int idx_inner = i % inner_num; @@ -84,15 +84,15 @@ class SoftmaxComputeTest : public arena::TestCase { void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType(op_type_); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); op_desc->SetAttr("axis", axis_); } void PrepareData() override { - std::vector din(dims_.production()); - fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); - SetCommonTensor(input_, dims_, din.data()); + std::vector x(x_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + SetCommonTensor(x_, x_dims_, x.data()); } }; @@ -100,18 +100,21 @@ TEST(Softmax, precision) { LOG(INFO) << "test softmax op"; 
float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 4e-3; // Using fp16 in NPU +#elif defined(LITE_WITH_XPU) place = TARGET(kXPU); #else return; #endif - std::vector> dims{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}; - for (auto dim_in : dims) { + for (auto x_dims : + std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { for (auto axis : {-1, 0, 1, 2, 3}) { - if (axis >= dim_in.size()) continue; + if (axis >= x_dims.size()) continue; std::unique_ptr tester( - new SoftmaxComputeTest(place, "def", DDim(dim_in), axis)); + new SoftmaxComputeTest(place, "def", DDim(x_dims), axis)); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } diff --git a/lite/tests/kernels/transpose_compute_test.cc b/lite/tests/kernels/transpose_compute_test.cc old mode 100755 new mode 100644 index 62e0fc8e410092975eed3ea5fec441a7859de81f..b4407bb5690fe8c1f4305cea584f9abf5af121bb --- a/lite/tests/kernels/transpose_compute_test.cc +++ b/lite/tests/kernels/transpose_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -24,13 +25,13 @@ int data_index(std::vector pos, DDimLite dims) { int d1 = dims[1]; int d2 = dims[2]; int d3 = dims[3]; - return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; + return pos[0] * d1 * d2 * d3 + pos[1] * d2 * d3 + pos[2] * d3 + pos[3]; } std::vector pos_trans(std::vector in_pos, std::vector axis) { std::vector out_pos(in_pos.size()); for (int i = 0; i < axis.size(); i++) { - out_pos[axis[i]] = in_pos[i]; + out_pos[i] = in_pos[axis[i]]; } return out_pos; } @@ -42,35 +43,34 @@ class TransposeComputeTester : public arena::TestCase { std::string input_ = "x"; std::string output_ = "out"; std::string xshape_ = "xshape"; - DDim x_dims_; + DDim dims_; std::vector axis_; public: 
TransposeComputeTester(const Place& place, const std::string& alias, - DDim x_dims, + DDim dims, std::vector axis) - : TestCase(place, alias), x_dims_(x_dims), axis_(axis) {} + : TestCase(place, alias), dims_(dims), axis_(axis) {} void RunBaseline(Scope* scope) override { auto* out = scope->NewTensor(output_); CHECK(out); auto* x = scope->FindTensor(input_); - auto x_dims = x->dims(); - std::vector out_shape(x_dims.size(), 0); - for (size_t i = 0; i < x_dims.size(); i++) { - out_shape[i] = x_dims[axis_[i]]; + std::vector out_shape(dims_.size(), 0); + for (size_t i = 0; i < dims_.size(); i++) { + out_shape[i] = dims_[axis_[i]]; } out->Resize(out_shape); auto y_dims = out->dims(); - int input_n = x_dims[0]; - int input_c = x_dims[1]; - int input_h = x_dims[2]; - int input_w = x_dims[3]; + int input_n = dims_[0]; + int input_c = dims_[1]; + int input_h = dims_[2]; + int input_w = dims_[3]; auto input_data = x->data(); auto output_data = out->mutable_data(); @@ -81,7 +81,7 @@ class TransposeComputeTester : public arena::TestCase { for (int w = 0; w < input_w; ++w) { std::vector in_pos{n, c, h, w}; std::vector out_pos = pos_trans(in_pos, axis_); - int in_index = data_index(in_pos, x_dims); + int in_index = data_index(in_pos, dims_); int out_index = data_index(out_pos, y_dims); output_data[out_index] = input_data[in_index]; } @@ -91,7 +91,7 @@ class TransposeComputeTester : public arena::TestCase { if (op_type_ == "transpose2") { auto* xshape = scope->NewTensor(xshape_); - auto xshape_dims = x_dims.Vectorize(); + auto xshape_dims = dims_.Vectorize(); xshape_dims.insert(xshape_dims.begin(), 0); xshape->Resize(xshape_dims); } @@ -108,11 +108,9 @@ class TransposeComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(x_dims_.production()); - for (int i = 0; i < x_dims_.production(); i++) { - data[i] = i * 1.1; - } - SetCommonTensor(input_, x_dims_, data.data()); + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 
1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); } }; @@ -122,14 +120,16 @@ TEST(Transpose, precision) { Place place; #ifdef LITE_WITH_XPU place = TARGET(kXPU); +#elif defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif DDim x_dims{{2, 3, 4, 5}}; - // [XPU]: {3, 1, 0, 2} is unsupported std::vector> axes{ - {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}}; + {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}, {3, 1, 0, 2}}; for (auto axis : axes) { std::unique_ptr tester( new TransposeComputeTester(place, "def", x_dims, axis)); diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 590d3fd29c37e16cfeec53557a825a4acf9684ca..d8ec2b01f787f32a00d645725717b412ef8a953a 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -223,67 +223,73 @@ class Unsqueeze2ComputeTester : public arena::TestCase { } }; -void test_unsqueeze(Place place) { +void test_unsqueeze(Place place, float abs_error = 2e-5) { for (std::vector axes : {std::vector({1}), std::vector({0, 2}), std::vector({0, -2})}) { - for (int N : {1}) { - for (int C : {3}) { - for (int H : {1}) { - for (int W : {5}) { - for (int input_axes_flag : {1, 2, 3}) { - LOG(INFO) << N << " " << C << " " << H << " " << W << " " - << input_axes_flag; - std::unique_ptr tester( - new UnsqueezeComputeTester( - place, "def", axes, DDim({N, C, H, W}), input_axes_flag)); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); - } - } - } + for (auto dims : std::vector>{{3}, {3, 5}, {3, 5, 7}}) + for (int input_axes_flag : {1, 2, 3}) { +#ifdef LITE_WITH_NPU + if (input_axes_flag != 1) continue; + if (dims.size() + axes.size() > 4) continue; +#endif + std::unique_ptr tester(new UnsqueezeComputeTester( + place, "def", axes, DDim(dims), input_axes_flag)); + arena::Arena arena(std::move(tester), place, abs_error); + 
arena.TestPrecision(); } - } } } -void test_unsqueeze2(Place place) { +void test_unsqueeze2(Place place, + float abs_error = 2e-5, + std::vector ignored_outs = {}) { for (std::vector axes : {std::vector({0}), std::vector({0, 2}), std::vector({0, -2})}) { - for (int N : {1}) { - for (int C : {3}) { - for (int H : {1}) { - for (int W : {5}) { - std::unique_ptr tester(new Unsqueeze2ComputeTester( - place, "def", axes, DDim({N, C, H, W}))); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); - } - } - } + for (auto dims : + std::vector>{{3}, {3, 5}, {3, 5, 7}}) { +#ifdef LITE_WITH_NPU + if (dims.size() + axes.size() > 4) continue; +#endif + std::unique_ptr tester( + new Unsqueeze2ComputeTester(place, "def", axes, DDim(dims))); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(ignored_outs); } } } TEST(squeeze, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_unsqueeze(place); + Place place; + float abs_error = 2e-5; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; #endif + + test_unsqueeze(place, abs_error); } TEST(squeeze2, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_unsqueeze2(place); + Place place; + float abs_error = 2e-5; + std::vector ignored_outs = {}; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU + ignored_outs.push_back("XShape"); // not supported out in NPU +#elif defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; #endif + + test_unsqueeze2(place, abs_error, ignored_outs); } } // namespace lite diff --git a/lite/tests/utils/timer.h b/lite/tests/utils/timer.h deleted file mode 100644 index 095f32046e0dc5b9342163e1f4f13f4e30c10670..0000000000000000000000000000000000000000 --- 
a/lite/tests/utils/timer.h +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // NOLINT -#include - -namespace paddle { -namespace lite { - -class Timer final { - public: - Timer() {} - - ~Timer() {} - - void clear() { ms_time_.clear(); } - - void start() { tstart_ = std::chrono::system_clock::now(); } - - void end() { - tend_ = std::chrono::system_clock::now(); - auto ts = - std::chrono::duration_cast(tend_ - tstart_); - latest_time_ = 1000.f * static_cast(ts.count()) * - std::chrono::microseconds::period::num / - std::chrono::microseconds::period::den; - ms_time_.push_back(latest_time_); - } - - float latest_time() const { return latest_time_; } - - float get_average_ms() { - if (ms_time_.size() == 0) { - return 0.f; - } - float sum = 0.f; - for (auto i : ms_time_) { - sum += i; - } - return sum / ms_time_.size(); - } - - float get_sum_ms() { - if (ms_time_.size() == 0) { - return 0.f; - } - float sum = 0.f; - for (auto i : ms_time_) { - sum += i; - } - return sum; - } - - // return tile (0-99) time. 
- float get_tile_time(float tile) { - if (tile < 0 || tile > 100) { - return -1.f; - } - int total_items = static_cast(ms_time_.size()); - if (total_items <= 0) { - return -2.f; - } - ms_time_.sort(); - int pos = static_cast(tile * total_items / 100); - auto it = ms_time_.begin(); - for (int i = 0; i < pos; ++i) { - ++it; - } - return *it; - } - - std::list get_time_stat() { return ms_time_; } - - float get_min_time() { - ms_time_.sort(); - return *ms_time_.begin(); - } - - float get_max_time() { - ms_time_.sort([](int a, int b) { return a > b; }); - return *ms_time_.begin(); - } - - private: - std::chrono::time_point tstart_; - std::chrono::time_point tend_; - std::list ms_time_; - float latest_time_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh deleted file mode 100755 index f4cfee5ec6b9256d94377cc8814ad73f64ca0546..0000000000000000000000000000000000000000 --- a/lite/tools/build_bm.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -set -ex - -# global variables with default value -BM_SDK_ROOT="$(pwd)/../BM_SDK" # BM SDK -TARGET_NAME="BM1682" # default target -BUILD_EXTRA=OFF # ON(with sequence ops)/OFF -WITH_TESTING=ON # ON/OFF - -function print_usage { - echo -e "\nUSAGE:" - echo - echo "----------------------------------------" - echo -e "--bm_sdk_root=" - echo -e "--target_name=" - echo "----------------------------------------" - echo -} - -# readonly variables with default value -readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ - -DWITH_PYTHON=OFF \ - -DLITE_WITH_ARM=OFF" - -readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THRLITE_BUILD_THREADSEADS:-1} - -readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz -readonly workspace=$(pwd) - -function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then - rm -rf $workspace/third-party - - if [ ! 
-f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive - fi -} - -# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. -# here we fake an empty file to make cmake works. -function prepare_workspace { - # in build directory - # 1. Prepare gen_code file - GEN_CODE_PATH_PREFIX=lite/gen_code - mkdir -p ./${GEN_CODE_PATH_PREFIX} - touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc - - # 2.Prepare debug tool - DEBUG_TOOL_PATH_PREFIX=lite/tools/debug - mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} - cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ - - # clone submodule - # git submodule update --init --recursive - prepare_thirdparty -} - -function build_bm { - build_dir=${workspace}/build.lite.bm - mkdir -p $build_dir - cd $build_dir - - prepare_workspace - cmake .. \ - ${CMAKE_COMMON_OPTIONS} \ - -DWITH_GPU=OFF \ - -DWITH_MKLDNN=OFF \ - -DLITE_WITH_X86=ON \ - -DWITH_MKL=ON \ - -DLITE_BUILD_EXTRA=ON \ - -DLITE_WITH_XPU=OFF \ - -DLITE_WITH_BM=ON \ - -DWITH_TESTING=${WITH_TESTING} \ - -DBM_SDK_ROOT=${BM_SDK_ROOT} - - make -j$NUM_CORES_FOR_COMPILE - - cd - - echo "Done" -} - -function main { - # Parse command line. 
- for i in "$@"; do - case $i in - --target_name=*) - TARGET_NAME="${i#*=}" - shift - ;; - --bm_sdk_root=*) - BM_SDK_ROOT="${i#*=}" - shift - ;; - bm) - build_bm - shift - ;; - *) - # unknown option - print_usage - exit 1 - ;; - esac - done -} - -main $@ diff --git a/lite/tools/build_xpu.sh b/lite/tools/build_xpu.sh index 9f28274471f0d0a06c755e41a4a92448a96873af..fdf287501e8f4411f51e73c55b789753f2e85674 100755 --- a/lite/tools/build_xpu.sh +++ b/lite/tools/build_xpu.sh @@ -104,6 +104,11 @@ function main { build_xpu shift ;; + full_publish) + TARGET_NAME=publish_inference + build_xpu + shift + ;; *) # unknown option print_usage diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 91afc5039cf1c863038cb6c8c5ce79aa856edf04..a0273efe13512e38e157dda76401f8946f79880f 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -610,6 +610,44 @@ function build_arm { } +# $1: ARM_TARGET_OS in "ios", "ios64" +# $2: ARM_TARGET_ARCH_ABI in "armv7", "armv8" +function build_ios { + local os=$1 + local abi=$2 + build_dir=build.ios.${os}.${abi} + echo "building ios target into $build_dir" + echo "target os: $os" + echo "target abi: $abi" + mkdir -p ${build_dir} + cd ${build_dir} + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + cmake .. 
\ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DWITH_TESTING=OFF \ + -DLITE_WITH_JAVA=OFF \ + -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_ON_TINY_PUBLISH=ON \ + -DLITE_WITH_OPENMP=OFF \ + -DWITH_ARM_DOTPROD=OFF \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DARM_TARGET_ARCH_ABI=$abi \ + -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ + -DLITE_WITH_CV=$BUILD_CV \ + -DARM_TARGET_OS=$os + + make -j4 publish_inference + cd - +} + # $1: ARM_TARGET_OS in "android" # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" # $3: ARM_TARGET_LANG in "gcc" "clang" @@ -773,6 +811,21 @@ function build_test_arm_subtask_armlinux { echo "Done" } +# sub-task3 +# this task will test IOS compiling, which requires cmake_version>=3.15 +function build_test_arm_subtask_ios { + cur=$PWD + # job 8 + build_ios "ios" "armv7" + cd $cur + + # job 9 + build_ios "ios64" "armv8" + cd $cur + + echo "Done" +} + # this method need to invoke `build_test_arm_subtask_android` first. function build_test_arm_subtask_model { # We just test following single one environment to limit the CI time. 
@@ -1042,6 +1095,10 @@ function main { build_test_arm_subtask_armlinux shift ;; + build_test_arm_subtask_ios) + build_test_arm_subtask_ios + shift + ;; check_style) check_style shift diff --git a/lite/tools/cmake_tools/create_fake_kernel_registry.py b/lite/tools/cmake_tools/create_fake_kernel_registry.py index 140d77320704f62dfb2492eec3ad7238fe3868ff..35012d5b163aac2b6998790b4cfcf31e16cb1454 100644 --- a/lite/tools/cmake_tools/create_fake_kernel_registry.py +++ b/lite/tools/cmake_tools/create_fake_kernel_registry.py @@ -18,6 +18,9 @@ import logging from ast import RegisterLiteKernelParser from utils import * +if len(sys.argv) != 4: + print("Error: create_fake_kernel_registry.py requires three inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] kernelmap_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/parse_kernel_registry.py b/lite/tools/cmake_tools/parse_kernel_registry.py index f4f0b95483687d3785168c132d30ac8a4fa87c8e..6c020ec438682b670e4e36a926095fed5452ec37 100644 --- a/lite/tools/cmake_tools/parse_kernel_registry.py +++ b/lite/tools/cmake_tools/parse_kernel_registry.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import sys import logging from ast import RegisterLiteKernelParser +if len(sys.argv) != 5: + print("Error: parse_kernel_registry.py requires four inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] minkernels_list_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/parse_op_registry.py b/lite/tools/cmake_tools/parse_op_registry.py index db58c455a9d5863ec0c66d7783871831c73c120f..7eb3337ed87b708102b2032de9a279fcae2d321c 100644 --- a/lite/tools/cmake_tools/parse_op_registry.py +++ b/lite/tools/cmake_tools/parse_op_registry.py @@ -13,10 +13,14 @@ # limitations under the License. ''' Collect op registry information. 
''' +from __future__ import print_function import sys import logging from ast import RegisterLiteOpParser +if len(sys.argv) != 5: + print("Error: parse_op_registry.py requires four inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] minops_list_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f6a3af6bd3e5a2decfb6b3b65b0357bff8b4a378 --- /dev/null +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -0,0 +1,129 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +import logging +from ast import RegisterLiteKernelParser +from ast import RegisterLiteOpParser + +if len(sys.argv) != 4: + print("Error: record_supported_kernel_op.py requires three inputs!") + exit(1) +kernels_list_path = sys.argv[1] +ops_list_path = sys.argv[2] +kernel_op_map_dest_path = sys.argv[3] + + +out_lines = [ +''' +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +const std::vector> supported_ops_target = { +''' +] + +ops_lines=[] + +# valid targets and valid_ops +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"] +valid_ops = [[],[],[],[],[],[],[],[],[],[]] +class TargetType: + kUnk = 0 + kHost = 1 + kX86 = 2 + kCUDA = 3 + kARM = 4 + kOpenCL = 5 + kFPGA = 7 + kNPU = 8 + kXPU = 9 + kAny = 6 # any target + +# record op_info of valid kernels into `valid_ops` according to different target type +with open(kernels_list_path) as f: + paths = set([path for path in f]) + for path in paths: + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + for k in kernel_parser.kernels: + if hasattr(TargetType, k.target): + index=getattr(TargetType, k.target) + valid_ops[index].append(k.op_type) + +# clear the repeated ops +for target in valid_targets: + index = getattr(TargetType, target) + valid_ops[index] = list(set(valid_ops[index])) + +paths = set() +with open(ops_list_path) as f: + paths = set([path for path in f]) + for path in paths: + str_info = open(path.strip()).read() + op_parser = RegisterLiteOpParser(str_info) + ops = op_parser.parse() + for op in ops: + if "_grad" in op: + continue + out = ' {"%s", { "' % op + op_targets = [] + for target in valid_targets: + if op in valid_ops[getattr(TargetType, target)]: + op_targets.append(target) + if len(op_targets) > 0: + out = out +'", "'.join(op_targets)+ '" }}' + else: + # unknow type op: kUnk = 0 + 
valid_ops[0].append(op) + out = out +'kUnk" }}' + ops_lines.append(out) + +with open(kernel_op_map_dest_path, 'w') as f: + logging.info("write kernel list to %s" % kernel_op_map_dest_path) + f.write('\n'.join(out_lines)) + # write kernels into head file + for target in valid_targets: + if len(valid_ops[getattr(TargetType, target)]) == 0 : + f.write("\n // %s_OPS: " %target) + f.write('\n {},') + else: + f.write("\n // %s_OPS: " %target) + f.write('\n {"') + f.write('","'.join(valid_ops[getattr(TargetType, target)])) + f.write('"},\n') + f.write('};') + # write op info into head file + f.write('\nconst std::map> supported_ops={\n') + f.write(',\n'.join(ops_lines)) + f.write('\n};') diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt index 0edcb2ef24ce4f53ffffa14ad70cbbc1d5c5971e..6c88e70de125b650bcf576fd686373c59e37454c 100644 --- a/lite/utils/cv/CMakeLists.txt +++ b/lite/utils/cv/CMakeLists.txt @@ -1,5 +1,4 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - set(lite_cv_deps) lite_cc_library(paddle_cv_arm SRCS image_convert.cc paddle_image_preprocess.cc @@ -7,5 +6,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ image_flip.cc image_rotate.cc image_resize.cc - DEPS ${lite_cv_deps} paddle_api place) + DEPS paddle_api place) endif() diff --git a/lite/utils/cv/image2tensor.cc b/lite/utils/cv/image2tensor.cc index b51a82da1d0e9dc1750670ef55690e9a34a659fc..3a09039a0f53c9ac49a472b61b477dd6d2e5ac33 100644 --- a/lite/utils/cv/image2tensor.cc +++ b/lite/utils/cv/image2tensor.cc @@ -18,6 +18,13 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -52,7 +59,7 @@ void bgra_to_tensor_hwc(const uint8_t* src, * NCHW * param src: input image data * param dstTensor: output tensor 
data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW @@ -79,6 +86,9 @@ void Image2Tensor::choose(const uint8_t* src, } else if (layout == LayoutType::kNHWC && (srcFormat == BGRA || srcFormat == RGBA)) { impl_ = bgra_to_tensor_hwc; + } else if ((layout == LayoutType::kNHWC || layout == LayoutType::kNCHW) && + (srcFormat == GRAY)) { + impl_ = gray_to_tensor; } else { printf("this layout: %d or image format: %d not support \n", static_cast(layout), @@ -87,6 +97,147 @@ void Image2Tensor::choose(const uint8_t* src, } impl_(src, output, srcw, srch, means, scales); } + +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + int dim16 = width >> 16; + int remain = width % 16; + + float32x4_t vmean = vdupq_n_f32(mean_val); + float32x4_t vscale = vdupq_n_f32(scale_val); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * width; + float* ptr_h = output + i * width; + int cnt = dim16; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #64] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr0], #192] \n" + "1: \n" + "ld1 {v0.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.." + "ld1 {v1.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.." 
+ // 8->16 + "ushll v3.8h, v0.8b, #0 \n" + "ushll v4.8h, v0.8b, #0 \n" + // 16->32 + "ushll v6.4s, v3.4h, #0 \n" + "ushll2 v7.4s, v3.8h, #0 \n" + "ushll v8.4s, v4.4h, #0 \n" + "ushll2 v9.4s, v4.8h, #0 \n" + // int32->fp32 + "ucvtf v12.4s, v6.4s \n" + "ucvtf v13.4s, v7.4s \n" + "ucvtf v14.4s, v8.4s \n" + "ucvtf v15.4s, v9.4s \n" + // sub -mean + "fsub v12.4s, v12.4s, %w[vmean].4s \n" + "fsub v13.4s, v13.4s, %w[vmean].4s \n" + "fsub v14.4s, v14.4s, %w[vmean].4s \n" + "fsub v15.4s, v15.4s, %w[vmean].4s \n" + // mul * scale + "fmul v6.4s, v12.4s, %w[vscale].4s \n" + "fmul v7.4s, v13.4s, %w[vscale].4s \n" + "fmul v8.4s, v14.4s, %w[vscale].4s \n" + "fmul v9.4s, v15.4s, %w[vscale].4s \n" + // store + "st1 {v6.4s}, [%[outr0]], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v7.4s}, [%[outr0]], #16 \n" + "st1 {v8.4s}, [%[outr0]], #16 \n" + "st1 {v9.4s}, [%[outr0]], #16 \n" + "bne 1b \n" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #64] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr0], #192] @ preload a, 64byte\n" + "1: \n" + "vld1.8 {d12, d13}, [%[inptr0]]! 
\n" + // 8->16 + "vmovl.u8 q8, d12 \n" + "vmovl.u8 q9, d13 \n" + // 16->32 + "vmovl.u16 q11, d16 \n" + "vmovl.u16 q12, d17 \n" + "vmovl.u16 q13, d18 \n" + "vmovl.u16 q14, d19 \n" + // int32->fp32 + "vcvt.f32.u32 q7, q11 \n" + "vcvt.f32.u32 q8, q12 \n" + "vcvt.f32.u32 q9, q13 \n" + "vcvt.f32.u32 q10, q14 \n" + // sub -mean + "vsub.f32 q7, q7, %q[vmean] \n" + "vsub.f32 q8, q8, %q[vmean] \n" + "vsub.f32 q9, q9, %q[vmean] \n" + "vsub.f32 q10, q10, %q[vmean] \n" + // mul *scale + "vmul.f32 q11, q7, %q[vscale] \n" + "vmul.f32 q12, q8, %q[vscale] \n" + "vmul.f32 q13, q9, %q[vscale] \n" + "vmul.f32 q14, q10, %q[vscale] \n" + // store + "vst1.32 {d22 - d23}, [%[outr0]]! \n" + "subs %[cnt], #1 \n" + "vst1.32 {d24 - d25}, [%[outr0]]! \n" + "vst1.32 {d26 - d27}, [%[outr0]]! \n" + "vst1.32 {d28 - d29}, [%[outr0]]! \n" + "bne 1b" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14"); +#endif + } + for (int j = 0; j < remain; j++) { + *ptr_h++ = (*din_ptr - mean_val) * scale_val; + din_ptr++; + } + } +} + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -390,6 +541,7 @@ void bgra_to_tensor_chw(const uint8_t* src, } } } + void bgr_to_tensor_hwc(const uint8_t* src, float* output, int width, diff --git a/lite/utils/cv/image_convert.cc b/lite/utils/cv/image_convert.cc index 24b6db70dd4f4fb1ad8e8c915444684d4db07cfd..385f56d233cb151445a086ed59d5c40374cd8c36 100644 --- a/lite/utils/cv/image_convert.cc +++ b/lite/utils/cv/image_convert.cc @@ -30,10 +30,14 @@ void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgra rgba to gray +void hwc4_to_hwc1(const uint8_t* src, uint8_t* 
dst, int srcw, int srch); // bgr rgb to gray void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch); // gray to bgr rgb void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// gray to bgra rgba +void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgr to bgra or rgb to rgba void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgra to bgr or rgba to rgb @@ -112,6 +116,12 @@ void ImageConvert::choose(const uint8_t* src, } else if ((srcFormat == RGB && dstFormat == BGRA) || (srcFormat == BGR && dstFormat == RGBA)) { impl_ = hwc3_trans_hwc4; + } else if ((srcFormat == GRAY && dstFormat == RGBA) || + (srcFormat == GRAY && dstFormat == BGRA)) { + impl_ = hwc1_to_hwc4; + } else if ((srcFormat == RGBA && dstFormat == GRAY) || + (srcFormat == BGRA && dstFormat == GRAY)) { + impl_ = hwc4_to_hwc1; } else { printf("srcFormat: %d, dstFormat: %d does not support! \n", srcFormat, @@ -989,7 +999,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { "vshrn.u32 d24, q6, #7 \n" "vshrn.u32 d25, q7, #7 \n" "vshrn.u32 d26, q8, #7 \n" - "vshrn.u32 d27, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" // 16->8 "vmovn.u16 d4, q10 \n" "vmovn.u16 d5, q11 \n" @@ -1077,6 +1087,280 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } /* +采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R +采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B +b = 0.114 *128 = 14.592 = 15 +g = 0.587 * 128 = 75.136 = 75 +r = 0.2989 * 128 = 38.2592 = 38 +Gray = (15*B + 75*G + 38*R)/128 +bgra2gray, rgba2gray +*/ +void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + uint8_t b = 15; + uint8_t g = 75; + uint8_t r = 38; + + uint8x8_t vb = vdup_n_u8(b); + uint8x8_t vg = vdup_n_u8(g); + uint8x8_t vr = vdup_n_u8(r); +#ifdef __aarch64__ +#else + uint8_t vb_array[8] = {b, b, b, b, b, b, b, b}; + uint8_t vg_array[8] = {g, g, g, g, g, g, g, g}; + uint8_t vr_array[8] = 
{r, r, r, r, r, r, r, r}; +#endif + int cnt_pro = srcw >> 3; + int remain_pro = srcw % 8; + int win = srcw * 4; + int i = 0; +#pragma omp parallel for + for (i = 0; i < srch - 3; i += 4) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + uint8_t* outr0 = dst + i * srcw; + uint8_t* outr1 = outr0 + srcw; + uint8_t* outr2 = outr1 + srcw; + uint8_t* outr3 = outr2 + srcw; + + int cnt = cnt_pro; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr1], #128] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "prfm pldl1keep, [%[inptr2], #128] \n" + "prfm pldl1keep, [%[inptr3]] \n" + "prfm pldl1keep, [%[inptr3], #128] \n" + "1: \n" + "ld4 {v0.8b - v3.8b}, [%[inptr0]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v4.8b - v7.8b}, [%[inptr1]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v8.8b - v11.8b}, [%[inptr2]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v12.8b - v15.8b}, [%[inptr3]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... 
+ // mul b + "umull v13.8h, v0.8b, %w[vb].8b \n" // v0 * vb + "umull v14.8h, v4.8b, %w[vb].8b \n" // v0 * vb + "umull v15.8h, v8.8b, %w[vb].8b \n" // v0 * vb + "umull v16.8h, v12.8b, %w[vb].8b \n" // v0 * vb + // mul g + "umull v17.8h, v1.8b, %w[vg].8b \n" // v0 * vb + "umull v18.8h, v5.8b, %w[vg].8b \n" // v0 * vb + "umull v19.8h, v9.8b, %w[vg].8b \n" // v0 * vb + "umull v20.8h, v13.8b, %w[vg].8b \n" // v0 * vb + // mul r + "umlal v13.8h, v2.8b, %w[vr].8b \n" // v0 * vb + "umlal v14.8h, v6.8b, %w[vr].8b \n" // v0 * vb + "umlal v15.8h, v10.8b, %w[vr].8b \n" // v0 * vb + "umlal v16.8h, v14.8b, %w[vr].8b \n" // v0 * vb + // 16->32 + "uaddl v0.4s, v17.4h, v13.4h \n" + "uaddl2 v1.4s, v17.8h, v13.8h \n" + "uaddl v2.4s, v18.4h, v14.4h \n" + "uaddl2 v3.4s, v18.8h, v14.8h \n" + "uaddl v4.4s, v19.4h, v15.4h \n" + "uaddl2 v5.4s, v19.8h, v15.8h \n" + "uaddl v6.4s, v20.4h, v16.4h \n" + "uaddl2 v7.4s, v20.8h, v16.8h \n" + // 32->16 v0 >> 7 + "shrn v12.4h, v0.4s, #7 \n" + "shrn2 v12.8h, v1.4s, #7 \n" + "shrn v13.4h, v2.4s, #7 \n" + "shrn2 v13.8h, v3.4s, #7 \n" + "shrn v14.4h, v4.4s, #7 \n" + "shrn2 v14.8h, v5.4s, #7 \n" + "shrn v15.4h, v6.4s, #7 \n" + "shrn2 v15.8h, v7.4s, #7 \n" + // 16->8 + "xtn v0.8b, v12.8h \n" + "xtn v1.8b, v13.8h \n" + "xtn v2.8b, v14.8h \n" + "xtn v3.8b, v15.8h \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v0.8b}, [%[outr0]], #8 \n" + "st1 {v1.8b}, [%[outr1]], #8 \n" + "st1 {v2.8b}, [%[outr2]], #8 \n" + "st1 {v3.8b}, [%[outr3]], #8 \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 
64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr1]] @ preload a, 64byte\n" + "pld [%[inptr1], #128] @ preload a, 64byte\n" + "pld [%[inptr2]] @ preload a, 64byte\n" + "pld [%[inptr2], #128] @ preload a, 64byte\n" + "pld [%[inptr3]] @ preload a, 64byte\n" + "pld [%[inptr3], #128] @ preload a, 64byte\n" + "vld1.8 d0, [%[vb]] \n" + "vld1.8 d1, [%[vg]] \n" + "vld1.8 d2, [%[vr]] \n" + "1: \n" + "vld4.8 {d3, d4, d5, d6}, [%[inptr0]]! \n" + "vld4.8 {d7, d8, d9, d10}, [%[inptr1]]! \n" + "vld4.8 {d11, d12, d13, d14}, [%[inptr2]]! \n" + "vld4.8 {d15, d16, d17, d18}, [%[inptr3]]! \n" + // vb + "vmull.u8 q10, d3, d0 \n" + "vmull.u8 q11, d7, d0 \n" + "vmull.u8 q12, d11, d0 \n" + "vmull.u8 q13, d15, d0 \n" + // vg + "vmull.u8 q14, d4, d1 \n" + "vmull.u8 q15, d8, d1 \n" + "vmull.u8 q5, d12, d1 \n" + "vmull.u8 q7, d16, d1 \n" + // vr + "vmlal.u8 q10, d5, d2 \n" + "vmlal.u8 q11, d9, d2 \n" + "vmlal.u8 q12, d13, d2 \n" + "vmlal.u8 q13, d17, d2 \n" + // 16->32 + "vaddl.u16 q2, d28, d20 \n" + "vaddl.u16 q3, d29, d21 \n" + "vaddl.u16 q4, d30, d22 \n" + "vaddl.u16 q10, d31, d23 \n" + "vaddl.u16 q6, d10, d24 \n" + "vaddl.u16 q11, d11, d25 \n" + "vaddl.u16 q8, d14, d26 \n" + "vaddl.u16 q9, d15, d27 \n" + // 32->16 q2 >> 7 + "vshrn.u32 d10, q2, #7 \n" + "vshrn.u32 d11, q3, #7 \n" + "vshrn.u32 d14, q4, #7 \n" + "vshrn.u32 d15, q10, #7 \n" + "vshrn.u32 d24, q6, #7 \n" + "vshrn.u32 d25, q11, #7 \n" + "vshrn.u32 d26, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" + // 16->8 + "vmovn.u16 d4, q5 \n" + "vmovn.u16 d5, q7 \n" + "vmovn.u16 d6, q12 \n" + "vmovn.u16 d7, q13 \n" + "subs %[cnt], #1 \n" + // store + "vst1.8 d4, [%[outr0]]! \n" + "vst1.8 d5, [%[outr1]]! \n" + "vst1.8 d6, [%[outr2]]! \n" + "vst1.8 d7, [%[outr3]]! 
\n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + for (; j < remain_pro; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + *outr1++ = (inptr1[0] * b + inptr1[1] * g + inptr1[2] * r) >> 7; + *outr2++ = (inptr2[0] * b + inptr2[1] * g + inptr2[2] * r) >> 7; + *outr3++ = (inptr3[0] * b + inptr3[1] * g + inptr3[2] * r) >> 7; + inptr0 += 4; + inptr1 += 4; + inptr2 += 4; + inptr3 += 4; + } + } + for (; i < srch; i++) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + uint8_t* outr0 = dst + i * srcw; + for (j = 0; j < cnt_pro; j++) { + uint8x8x4_t y0 = vld4_u8(inptr0); // d8 = y0y3y6y9.. 
d9 = y1y4y7...y + uint16x8_t val0 = vmull_u8(y0.val[0], vb); + + uint16x8_t val0_1 = vmull_u8(y0.val[1], vg); + + val0 = vmlal_u8(val0, y0.val[2], vr); + + uint32x4_t v0_sum0 = vaddl_u16(vget_low_u16(val0_1), vget_low_u16(val0)); + uint32x4_t v0_sum1 = + vaddl_u16(vget_high_u16(val0_1), vget_high_u16(val0)); + + uint16x4_t v0_sum0_16 = vshrn_n_u32(v0_sum0, 7); + uint16x4_t v0_sum1_16 = vshrn_n_u32(v0_sum1, 7); + + uint16x8_t v0_sum = vcombine_u16(v0_sum0_16, v0_sum1_16); + + uint8x8_t vout0 = vmovn_u16(v0_sum); + + inptr0 += 32; + vst1_u8(outr0, vout0); + outr0 += 8; + } + for (j = cnt_pro * 8; j < srcw; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + inptr0 += 4; + } + } +} +/* 采用CV_GRAY2BGR,转换公式B = G = R = Gray 采用CV_GRAY2RGB,转换公式R = G = B = Gray gray2bgr, gray2rgb @@ -1091,6 +1375,22 @@ void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } } +/* +采用CV_GRAY2BGRA,转换公式B = G = R = Gray A=255 +采用CV_GRAY2RGBA,转换公式R = G = B = Gray A=255 +gray2bgra, gray2rgba +*/ +void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + *dst++ = 255; + src++; + } + } +} // bgr2bgra, rgb2rgba void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { diff --git a/lite/utils/cv/image_flip.cc b/lite/utils/cv/image_flip.cc index fd84691a2d1d244350f40238bc137d5d159ba62b..f535c858e4dddcd04a0ce8cfa7a727356df34d64 100644 --- a/lite/utils/cv/image_flip.cc +++ b/lite/utils/cv/image_flip.cc @@ -19,6 +19,23 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageFlip::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip_param) { + if (srcFormat == GRAY) { + flip_hwc1(src, dst, srcw, srch, flip_param); + } else if (srcFormat == BGR || srcFormat == RGB) { + flip_hwc3(src, dst, srcw, srch, 
flip_param); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + flip_hwc4(src, dst, srcw, srch, flip_param); + } else { + printf("this srcFormat: %d does not support! \n", srcFormat); + return; + } +} // gray void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in); void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in); @@ -43,6 +60,9 @@ void flip_hwc1(const uint8_t* src, flip_hwc1_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc1_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } @@ -57,6 +77,9 @@ void flip_hwc3(const uint8_t* src, flip_hwc3_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc3_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } @@ -71,6 +94,9 @@ void flip_hwc4(const uint8_t* src, flip_hwc4_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc4_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } /* diff --git a/lite/utils/cv/image_flip.h b/lite/utils/cv/image_flip.h index 5e513324a179423ec1d008d6e6cd33d29a79c095..7215b9494a36d50cba787be7e53253d704bde8bd 100644 --- a/lite/utils/cv/image_flip.h +++ b/lite/utils/cv/image_flip.h @@ -21,6 +21,15 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageFlip { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip_param); +}; void flip_hwc1( const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param); void flip_hwc3( diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc index 8b0b8aa17d3ced769c7ff606e9ba5fe78208b3d7..cd02a2cf4bd0bdfa0f2c45ed2cf0b1ead803480c 100644 --- a/lite/utils/cv/image_resize.cc +++ b/lite/utils/cv/image_resize.cc @@ -38,6 +38,15 @@ namespace paddle { namespace lite { 
namespace utils { namespace cv { +void ImageResize::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth) { + resize(src, dst, srcFormat, srcw, srch, dstw, dsth); +} void compute_xy(int srcw, int srch, int dstw, diff --git a/lite/utils/cv/image_resize.h b/lite/utils/cv/image_resize.h index e2e399f542c3b00eaf6a3b09f6315b38518f409f..f11f7b5d93159509ca9069f409335e6530060383 100644 --- a/lite/utils/cv/image_resize.h +++ b/lite/utils/cv/image_resize.h @@ -39,6 +39,16 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageResize { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth); +}; void resize(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, diff --git a/lite/utils/cv/image_rotate.cc b/lite/utils/cv/image_rotate.cc index 04ba84076685f89c376203d69ea631afe03671ec..98e61fb444aad691d28ae2116dbbd5743e20e481 100644 --- a/lite/utils/cv/image_rotate.cc +++ b/lite/utils/cv/image_rotate.cc @@ -19,6 +19,26 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageRotate::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree) { + if (degree != 90 && degree != 180 && degree != 270) { + printf("this degree: %f not support \n", degree); + } + if (srcFormat == GRAY) { + rotate_hwc1(src, dst, srcw, srch, degree); + } else if (srcFormat == BGR || srcFormat == RGB) { + rotate_hwc3(src, dst, srcw, srch, degree); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + rotate_hwc4(src, dst, srcw, srch, degree); + } else { + printf("this srcFormat: %d does not support! 
\n", srcFormat); + return; + } +} // gray void rotate_hwc1_90( const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); @@ -50,6 +70,9 @@ void rotate_hwc1( rotate_hwc1_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc1_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -61,6 +84,9 @@ void rotate_hwc3( rotate_hwc3_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc3_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -72,6 +98,9 @@ void rotate_hwc4( rotate_hwc4_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc4_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } #ifdef __aarch64__ @@ -578,6 +607,7 @@ void rotate_hwc1_90(const uint8_t* src, int stride_h = 4 * w_in; int stride_h_w = 4 * w_in - 8; int stride_out = 4 * w_out; + int ww = w_out - 8; #pragma omp parallel for for (i = 0; i < h_in - 7; i += 8) { const uint8_t* inptr0 = src + i * w_in; @@ -586,7 +616,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; int j = 0; for (; j < w_in - 7; j += 8) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + (ww - i); uint8_t* outptr1 = outptr0 + w_out; uint8_t* outptr2 = outptr1 + w_out; uint8_t* outptr3 = outptr2 + w_out; @@ -648,7 +678,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr6 = inptr5 + w_in; const uint8_t* inptr7 = inptr6 + w_in; for (; j < w_in; j++) { - uint8_t* outptr = dst + j * w_out + i; + uint8_t* outptr = dst + j * w_out + ww - i; *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; @@ -659,10 +689,11 @@ void rotate_hwc1_90(const uint8_t* src, *outptr++ = *inptr7++; } } + ww = w_out - 1; for (; i < h_in; i++) { const uint8_t* inptr0 = src + i * 
w_in; for (int j = 0; j < w_in; j++) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + ww - i; *outptr0 = *inptr0++; } } @@ -693,9 +724,9 @@ void rotate_hwc1_180(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; uint8_t* outptr0 = dst + (h_in - i) * w_out - stride_w; // last - uint8_t* outptr1 = outptr0 + w_out; - uint8_t* outptr2 = outptr1 + w_out; - uint8_t* outptr3 = outptr2 + w_out; + uint8_t* outptr1 = outptr0 - w_out; + uint8_t* outptr2 = outptr1 - w_out; + uint8_t* outptr3 = outptr2 - w_out; if (i + 3 >= h_in) { uint8_t* ptr = zerobuff + w_in - stride_w; diff --git a/lite/utils/cv/image_rotate.h b/lite/utils/cv/image_rotate.h index 8335fca28051c3ba0ae5070464c32d5e804361f4..8e04a3f5244ab5740f9ee1b0e3586cdcea7aa32a 100644 --- a/lite/utils/cv/image_rotate.h +++ b/lite/utils/cv/image_rotate.h @@ -16,10 +16,20 @@ #include #include +#include "lite/utils/cv/paddle_image_preprocess.h" namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageRotate { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree); +}; void rotate_hwc1( const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree); void rotate_hwc3( diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index f18047556874a82d28c5964a1b5fd2fa8284c814..c46811a046a19a50592097fb987280ad19608193 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -25,7 +25,6 @@ namespace paddle { namespace lite { namespace utils { namespace cv { - #define PI 3.14159265f #define Degrees2Radians(degrees) ((degrees) * (SK_ScalarPI / 180)) #define Radians2Degrees(radians) ((radians) * (180 / SK_ScalarPI)) @@ -38,7 +37,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, this->dstFormat_ = dstFormat; this->transParam_ = param; } -void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) 
{ +void ImagePreprocess::imageConvert(const uint8_t* src, uint8_t* dst) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -48,10 +47,10 @@ void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) { this->transParam_.ih); } -void ImagePreprocess::imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat) { +void ImagePreprocess::imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -68,7 +67,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, int srch, int dstw, int dsth) { - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { @@ -77,7 +77,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { int dstw = this->transParam_.ow; int dsth = this->transParam_.oh; auto srcFormat = this->dstFormat_; - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageRotate(const uint8_t* src, @@ -86,19 +87,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, int srcw, int srch, float degree) { - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - if (srcFormat == GRAY) { - rotate_hwc1(src, dst, srcw, srch, degree); - } else if (srcFormat == BGR || srcFormat == RGB) { - rotate_hwc3(src, dst, srcw, srch, degree); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - rotate_hwc4(src, dst, srcw, srch, degree); - } else { - printf("this srcFormat: %d does not support! 
\n", srcFormat); - return; - } + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { @@ -106,10 +96,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto degree = this->transParam_.rotate_param; - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - ImagePreprocess::imageRotate(src, dst, srcFormat, srcw, srch, degree); + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageFlip(const uint8_t* src, @@ -118,16 +106,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, int srcw, int srch, FlipParam flip_param) { - if (srcFormat == GRAY) { - flip_hwc1(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGR || srcFormat == RGB) { - flip_hwc3(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - flip_hwc4(src, dst, srcw, srch, flip_param); - } else { - printf("this srcFormat: %d does not support! 
\n", srcFormat); - return; - } + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { @@ -135,7 +115,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto flip_param = this->transParam_.flip_param; - ImagePreprocess::imageFlip(src, dst, srcFormat, srcw, srch, flip_param); + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::image2Tensor(const uint8_t* src, diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 5a46a9e48e8202fe29ec9fc7d950ccf15920cc32..a12c0d11f067fc3e807682f9a213d3024def97e0 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -19,6 +19,7 @@ #include #include "lite/api/paddle_api.h" #include "lite/api/paddle_place.h" + namespace paddle { namespace lite { namespace utils { @@ -37,9 +38,9 @@ enum ImageFormat { }; // flip enum enum FlipParam { - X = 0, // flip along the X axis - Y, // flip along the Y axis - XY // flip along the XY axis + XY = -1, // flip along the XY axis + X = 0, // flip along the X axis + Y // flip along the Y axis }; // transform param typedef struct { @@ -69,11 +70,12 @@ class ImagePreprocess { * BGR(RGB)and BGRA(RGBA) transform, * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, - * BGR(RGB)and GRAY transform, + * BGR(RGB) and GRAY transform, + * BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data */ - void imageCovert(const uint8_t* src, uint8_t* dst); + void imageConvert(const uint8_t* src, uint8_t* dst); /* * image color convert * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), @@ -81,6 +83,7 @@ class ImagePreprocess { * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, * BGR(RGB)and GRAY transform, + * 
BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data * param srcFormat: input image image format support: GRAY, NV12(NV21), @@ -88,10 +91,10 @@ class ImagePreprocess { * param dstFormat: output image image format, support GRAY, BGR(RGB) and * BGRA(RGBA) */ - void imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat); + void imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat); /* * image resize, use bilinear method * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: @@ -171,7 +174,8 @@ class ImagePreprocess { FlipParam flip_param); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data @@ -186,7 +190,8 @@ class ImagePreprocess { float* scales); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data diff --git a/lite/utils/env.h b/lite/utils/env.h old mode 100755 new mode 100644 diff --git a/mobile/src/common/log.h b/mobile/src/common/log.h index 69654c505d234709d6c3119be346cefaf82c04a9..3b42188b62278c0acde41d52d68cc4b48ee6cda9 100644 --- a/mobile/src/common/log.h +++ b/mobile/src/common/log.h @@ -80,7 +80,6 @@ static const char *ANDROID_LOG_TAG = #endif enum LogLevel { - kNO_LOG, kLOG_ERROR, kLOG_WARNING, kLOG_INFO, @@ -89,15 +88,16 @@ enum LogLevel { kLOG_DEBUG1, kLOG_DEBUG2, kLOG_DEBUG3, - kLOG_DEBUG4 + kLOG_DEBUG4, + kNO_LOG, }; // log level static LogLevel log_level = kLOG_DEBUG4; -static std::vector logs{"NO ", "ERROR ", "WARNING", "INFO ", - "VERBOSE", "DEBUG ", "DEBUG1 ", 
"DEBUG2 ", - "DEBUG3 ", "DEBUG4 "}; +static std::vector logs{"ERROR ", "WARNING", "INFO ", "VERBOSE", + "DEBUG ", "DEBUG1 ", "DEBUG2 ", "DEBUG3 ", + "DEBUG4 ", "NO "}; struct ToLog; struct Print; @@ -217,7 +217,6 @@ struct ToLog { #define ANDROIDLOGV(...) enum LogLevel { - kNO_LOG, kLOG_ERROR, kLOG_WARNING, kLOG_INFO, @@ -226,7 +225,8 @@ enum LogLevel { kLOG_DEBUG1, kLOG_DEBUG2, kLOG_DEBUG3, - kLOG_DEBUG4 + kLOG_DEBUG4, + kNO_LOG }; struct ToLog; diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/framework/cl/cl_deleter.h b/mobile/src/framework/cl/cl_deleter.h index 55af631174ae9f2a7815c2da35ebadda3ebfd9e9..731e5de663cd7af63a5a981dfb1d46f82101d6b8 100644 --- a/mobile/src/framework/cl/cl_deleter.h +++ b/mobile/src/framework/cl/cl_deleter.h @@ -15,45 +15,51 @@ limitations under the License. */ #pragma once #include "CL/cl.h" - +#include "common/log.h" struct CLKernelDeleter { template void operator()(T *clKernelObj) { - clReleaseKernel(clKernelObj); + const cl_int status = clReleaseKernel(clKernelObj); + LOG(paddle_mobile::kNO_LOG) << "clReleaseKernel status: " << status; } }; struct CLMemDeleter { template void operator()(T *clMemObj) { - clReleaseMemObject(clMemObj); + const cl_int status = clReleaseMemObject(clMemObj); + LOG(paddle_mobile::kNO_LOG) << "CLMemDeleter status: " << status; } }; struct CLEventDeleter { template void operator()(T *clEventObj) { - clReleaseEvent(clEventObj); + const cl_int status = clReleaseEvent(clEventObj); + LOG(paddle_mobile::kNO_LOG) << "CLEventDeleter status: " << status; } }; struct CLCommQueueDeleter { template void operator()(T *clQueueObj) { - clReleaseCommandQueue(clQueueObj); + const cl_int status = clReleaseCommandQueue(clQueueObj); + LOG(paddle_mobile::kNO_LOG) << "CLCommQueueDeleter status: " << status; } }; struct CLContextDeleter { template void 
operator()(T *clContextObj) { - clReleaseContext(clContextObj); + const cl_int status = clReleaseContext(clContextObj); + LOG(paddle_mobile::kNO_LOG) << "CLContextDeleter status: " << status; } }; struct CLProgramDeleter { template void operator()(T *clProgramObj) { - clReleaseProgram(clProgramObj); + const cl_int status = clReleaseProgram(clProgramObj); + LOG(paddle_mobile::kNO_LOG) << "CLProgramDeleter status: " << status; } }; diff --git a/mobile/src/framework/cl/cl_engine.cpp b/mobile/src/framework/cl/cl_engine.cpp index c39ae00b00c0b240b15b7d98805d097b2af50ef4..e8a8361eac71083d126b9ca4c22a098c6a9192fe 100644 --- a/mobile/src/framework/cl/cl_engine.cpp +++ b/mobile/src/framework/cl/cl_engine.cpp @@ -23,9 +23,11 @@ namespace paddle_mobile { namespace framework { bool CLEngine::Init() { + LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init()"; if (initialized_) { return true; } + LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init() ..."; cl_int status; bool is_setplatform_success = SetPlatform(); bool is_setcldeviceid_success = SetClDeviceId(); @@ -53,12 +55,14 @@ bool CLEngine::SetPlatform() { return false; } /**For clarity, choose the first available platform. 
*/ + LOG(paddle_mobile::kNO_LOG) << "numPlatforms: " << numPlatforms; if (numPlatforms > 0) { cl_platform_id *platforms = reinterpret_cast( malloc(numPlatforms * sizeof(cl_platform_id))); status = clGetPlatformIDs(numPlatforms, platforms, NULL); platform_ = platforms[0]; free(platforms); + LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; return status == CL_SUCCESS; } @@ -67,70 +71,21 @@ bool CLEngine::SetPlatform() { bool CLEngine::SetClDeviceId() { cl_uint numDevices = 0; - devices_ = NULL; + LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; cl_int status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { return false; } + LOG(paddle_mobile::kNO_LOG) << "numDevices: " << numDevices; + if (numDevices > 0) { - devices_ = reinterpret_cast( - malloc(numDevices * sizeof(cl_device_id))); status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_, NULL); + LOG(paddle_mobile::kNO_LOG) << "devices_[0]" << devices_[0]; return status == CL_SUCCESS; } return false; } - -// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel( -// const std::string &kernel_name) { -// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel( -// clCreateKernel(program_.get(), kernel_name.c_str(), NULL)); -// return std::move(kernel); -//} -// -// bool CLEngine::SetClCommandQueue() { -// cl_int status; -// command_queue_.reset( -// clCreateCommandQueue(context_.get(), devices_[0], 0, &status)); -// return true; -//} - -// bool CLEngine::SetClContext() { -// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL)); -// return true; -//} - -// bool CLEngine::LoadKernelFromFile(const char *kernel_file) { -// size_t size; -// char *str; -// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary)); -// -// if (!f.is_open()) { -// return false; -// } -// -// size_t fileSize; -// f.seekg(0, std::fstream::end); -// size = fileSize = (size_t)f.tellg(); -// f.seekg(0, std::fstream::beg); 
-// str = new char[size + 1]; -// if (!str) { -// f.close(); -// return 0; -// } -// -// f.read(str, fileSize); -// f.close(); -// str[size] = '\0'; -// const char *source = str; -// size_t sourceSize[] = {strlen(source)}; -// program_.reset( -// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize, -// NULL)); -// return true; -//} - } // namespace framework } // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_engine.h b/mobile/src/framework/cl/cl_engine.h index 2e21dd9e395354d2bd5e35a648687a6116347caf..2a6362ebc06c1e99a0e26502b4da0883732c9112 100644 --- a/mobile/src/framework/cl/cl_engine.h +++ b/mobile/src/framework/cl/cl_engine.h @@ -57,19 +57,27 @@ class CLLocalWorkSizeInfo { // max number of work-items in local_work_size in dim 2 size_t max_work_item_size2; }; - +inline void ctx_info(const char *errinfo, const void *private_info, size_t cb, + void *user_data) { + fprintf(stderr, "OpenCL Error (via pfn_notify): %s\n", errinfo); +} class CLEngine { public: static CLEngine *Instance(); bool Init(); bool isInitSuccess(); - std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() { + + std::shared_ptr<_cl_context> CreateContext() { + DLOG << "CreateContext ---"; + DLOG << "platform: " << platform_; + DLOG << "devices_[0]: " << devices_[0]; + cl_int status; - cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status); - std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c); + cl_context c = clCreateContext(NULL, 1, devices_, &ctx_info, NULL, &status); + std::shared_ptr<_cl_context> context(c, CLContextDeleter()); CL_CHECK_ERRORS(status); - return std::move(context_ptr); + return std::move(context); } std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( @@ -84,14 +92,14 @@ class CLEngine { } cl_context getContext() { - if (context_ == nullptr) { + if (context_.get() == nullptr) { context_ = CreateContext(); } return context_.get(); } cl_command_queue getClCommandQueue() { - if 
(command_queue_ == nullptr) { + if (command_queue_.get() == nullptr) { command_queue_ = CreateClCommandQueue(getContext()); } return command_queue_.get(); @@ -124,9 +132,9 @@ class CLEngine { if (status != CL_SUCCESS || ret_size / sizeof(size_t) < 3) { return CLLocalWorkSizeInfo(0, 0, 0, 0); } - DLOG << max_work_item_sizes[0]; - DLOG << max_work_item_sizes[1]; - DLOG << max_work_item_sizes[2]; + DLOG << " max_work_item_sizes {" << max_work_item_sizes[0] << ", " + << max_work_item_sizes[1] << ", " << max_work_item_sizes[2] << "}"; + localWorkSizeInfo_ = CLLocalWorkSizeInfo(max_work_group_size, max_work_item_sizes[0], max_work_item_sizes[1], max_work_item_sizes[2]); @@ -182,8 +190,8 @@ class CLEngine { cl_program p = clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); - DLOG << " cl kernel from source"; - DLOG << " source size: " << sourceSize[0]; + LOG(kLOG_DEBUG4) << " cl kernel from source"; + LOG(kLOG_DEBUG4) << " source size: " << sourceSize[0]; CL_CHECK_ERRORS(status_); std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); @@ -216,11 +224,7 @@ class CLEngine { DLOG << " program build error: " << log; } - if (status == CL_SUCCESS) { - return true; - } else { - return false; - } + return status == CL_SUCCESS; } cl_device_id DeviceID(int index = 0) { return devices_[index]; } @@ -239,28 +243,13 @@ class CLEngine { CLLocalWorkSizeInfo localWorkSizeInfo_; - cl_platform_id platform_; - - cl_device_id *devices_; - cl_int status_; - std::string cl_path_; - std::unique_ptr<_cl_program, CLProgramDeleter> program_; - - std::unique_ptr<_cl_context, CLContextDeleter> context_ = nullptr; - - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ = - nullptr; - - // bool SetClContext(); - - // bool SetClCommandQueue(); - - // bool LoadKernelFromFile(const char *kernel_file); - - // bool BuildProgram(); bool is_init_success_ = false; + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_; + std::shared_ptr<_cl_context> 
context_; + cl_device_id devices_[10]; + cl_platform_id platform_; }; } // namespace framework diff --git a/mobile/src/framework/cl/cl_helper.h b/mobile/src/framework/cl/cl_helper.h index 893456211d0429701b49d0f0be654beaad16e0e2..db9aa37ae2b7219131b5950e54ec008828f1fc70 100644 --- a/mobile/src/framework/cl/cl_helper.h +++ b/mobile/src/framework/cl/cl_helper.h @@ -36,9 +36,9 @@ class CLHelper { void AddKernel(const std::string &kernel_name, const std::string &file_name, const std::string &options = "") { - DLOG << " begin add kernel "; + LOG(kLOG_DEBUG1) << " begin add kernel "; auto kernel = scope_->GetKernel(kernel_name, file_name, options); - DLOG << " add kernel ing "; + LOG(kLOG_DEBUG1) << " begin add kernel "; kernels.emplace_back(std::move(kernel)); } diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h index d3d48cda8b86b07e76658ef903863268042ab36f..57656c3c6d995f9e9c2b5bb8e921b44310d3bbd5 100644 --- a/mobile/src/framework/cl/cl_image.h +++ b/mobile/src/framework/cl/cl_image.h @@ -87,14 +87,14 @@ class CLImage { PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, " need call SetTensorData first"); - DLOG << " begin init cl image "; + LOG(kNO_LOG) << " begin init cl image "; image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); half_t *image_data = new half_t[product(image_dims_) * 4]; - DLOG << " convert to image"; + LOG(kNO_LOG) << " convert to image"; converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); - DLOG << " end convert to image"; + LOG(kNO_LOG) << " end convert to image"; InitCLImage(context, image_dims_[0], image_dims_[1], image_data); @@ -105,7 +105,7 @@ class CLImage { tensor_data_ = nullptr; image_converter_ = converter; initialized_ = true; - DLOG << " end init cl image"; + LOG(kNO_LOG) << " end init cl image"; } void InitNImage(cl_context context, cl_command_queue command_queue) { @@ -137,9 +137,9 @@ class CLImage { // CLImageConverterFolder(); CLImageConverterNormal *normal_converter = new 
CLImageConverterNormal(); PADDLE_MOBILE_ENFORCE(!shared_mem_, "do not init mem after shared .") - DLOG << " to get image dims "; + // LOG(kNO_LOG) << " to get image dims "; image_dims_ = normal_converter->InitImageDimInfoWith(dim); - DLOG << " end get image dims " << image_dims_; + // LOG(kNO_LOG) << " end get image dims " << image_dims_; InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); @@ -148,7 +148,7 @@ class CLImage { image_converter_ = normal_converter; cl_event_ = CLEngine::Instance()->CreateEvent(context); initialized_ = true; - DLOG << " end init cl image"; + // LOG(kNO_LOG) << " end init cl image"; } /** * create fake size cl_mem for mem share @@ -169,9 +169,9 @@ class CLImage { InitCLImage(context, real_image_dims_[0], real_image_dims_[1], nullptr); // cheat cl_image they got what they wanted image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - DLOG << "InitFakeSizeImage ... "; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "InitFakeSizeImage ... "; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && real_image_dims_[1] >= image_dims_[1], "real image is not enough"); @@ -182,7 +182,7 @@ class CLImage { initialized_ = true; shared_mem_ = true; - DLOG << " end init FakeSizeImage"; + LOG(kNO_LOG) << " end init FakeSizeImage"; } /** * init cl mem with a exist cl mem @@ -197,21 +197,21 @@ class CLImage { real_image_dims_ = src.real_image_dims_; image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - DLOG << "InitWithExistMem ... "; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "InitWithExistMem ... 
"; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; if (real_image_dims_[0] < image_dims_[0] || real_image_dims_[1] < image_dims_[1]) { - DLOG << "real image is not enough!"; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "real image is not enough!"; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; } PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && real_image_dims_[1] >= image_dims_[1], "real image is not enough!"); if (cl_image_ != src.cl_image_) { - cl_image_.reset(src.cl_image_.get(), CLMemDeleter()); + cl_image_ = src.cl_image_; } tensor_dims_ = need_dims; @@ -221,7 +221,7 @@ class CLImage { initialized_ = true; shared_mem_ = true; - DLOG << " end init WithExistMem"; + LOG(kNO_LOG) << " end init WithExistMem"; } void InitConv2dTransposeFilterCLImage(cl_context context, @@ -233,18 +233,6 @@ class CLImage { InitCLImage(context, command_queue, converter); } - /*! The internal of two tensors share the same memory block. */ - inline CLImage &ShareHolderWith(const CLImage &src) { - PADDLE_MOBILE_ENFORCE( - src.cl_image_ != nullptr, - "Tensor holds no memory. 
Call Tensor::mutable_data first.") - - if (cl_image_ != src.cl_image_) { - cl_image_.reset(src.cl_image_.get(), CLMemDeleter()); - } - return *this; - } - cl_mem GetCLImage() const { return cl_image_.get(); } const DDim &ImageDims() const { return image_dims_; } diff --git a/mobile/src/framework/cl/cl_scope.h b/mobile/src/framework/cl/cl_scope.h index 643ce32b57616305da0c581d6d50dfcbbc4f1b1d..49e705e5a0a7f401954bca9719bfdad4c7065081 100644 --- a/mobile/src/framework/cl/cl_scope.h +++ b/mobile/src/framework/cl/cl_scope.h @@ -35,30 +35,27 @@ namespace framework { class CLScope { public: - CLScope() { - CLEngine *engine = CLEngine::Instance(); - context_ = engine->getContext(); - command_queue_ = engine->getClCommandQueue(); - localWorkSizeInfo_ = engine->getLocalWorkSizeInfo(); - } + CLScope() {} - cl_command_queue CommandQueue() { return command_queue_; } + cl_command_queue CommandQueue() { + return CLEngine::Instance()->getClCommandQueue(); + } std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel( const std::string &kernel_name, const std::string &file_name, const std::string &options) { - DLOG << " to get program " << file_name; + LOG(kLOG_DEBUG2) << " to get program " << file_name; auto program = Program(file_name, kernel_name, options); - DLOG << " end get program ~ "; - DLOG << " to create kernel: " << kernel_name; + LOG(kLOG_DEBUG2) << " end get program ~ "; + LOG(kLOG_DEBUG2) << " to create kernel: " << kernel_name; std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( clCreateKernel(program, kernel_name.c_str(), &status_)); CL_CHECK_ERRORS(status_); - DLOG << " end create kernel ~ "; + LOG(kLOG_DEBUG2) << " end create kernel ~ "; return std::move(kernel); } - cl_context Context() { return context_; } + cl_context Context() { return CLEngine::Instance()->getContext(); } cl_program Program(const std::string &file_name, const std::string &kernel_name, @@ -79,11 +76,13 @@ class CLScope { std::string header(header_it->second.begin(), header_it->second.end()); 
source = header + "\n" + source; auto program = CLEngine::Instance()->CreateProgramWithSource( - context_, source.c_str()); + CLEngine::Instance()->getContext(), source.c_str()); - DLOG << " --- begin build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- begin build program -> " << program_key + << " --- "; CLEngine::Instance()->BuildProgram(program.get(), options); - DLOG << " --- end build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- end build program -> " << program_key + << " --- "; programs_[program_key] = std::move(program); return programs_[program_key].get(); @@ -97,19 +96,23 @@ class CLScope { return it->second.get(); } auto program = CLEngine::Instance()->CreateProgramWith( - context_, + CLEngine::Instance()->getContext(), CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name); - DLOG << " --- begin build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- begin build program ele-> " << program_key + << " --- "; CLEngine::Instance()->BuildProgram(program.get(), options); - DLOG << " --- end build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- end build program ele-> " << program_key + << " --- "; programs_[program_key] = std::move(program); return programs_[program_key].get(); } } - CLLocalWorkSizeInfo LocalWorkSizeInfo() { return localWorkSizeInfo_; } + CLLocalWorkSizeInfo LocalWorkSizeInfo() { + return CLEngine::Instance()->getLocalWorkSizeInfo(); + } size_t KernelWorkSize(cl_kernel kernel) { size_t kernel_work_size = CLEngine::Instance()->GetKernelWorkSize(kernel); return kernel_work_size; @@ -117,12 +120,9 @@ class CLScope { private: cl_int status_; - cl_context context_; - cl_command_queue command_queue_; std::unordered_map> programs_; - CLLocalWorkSizeInfo localWorkSizeInfo_; }; } // namespace framework diff --git a/mobile/src/framework/context.h b/mobile/src/framework/context.h index 
944d54cc499f2a3c4fcad5c2fb0dfc4fe9bcac1d..18e40311bc2a5d555bb02cf0eb7af6356cbbf0b0 100644 --- a/mobile/src/framework/context.h +++ b/mobile/src/framework/context.h @@ -44,15 +44,13 @@ namespace framework { struct CPUContext { private: CPUContext(); - virtual ~CPUContext() {} public: + ~CPUContext() {} + static CPUContext* Context() { - static CPUContext* ctx = nullptr; - if (ctx == nullptr) { - ctx = new CPUContext(); - } - return ctx; + static CPUContext ctx; + return &ctx; } void set_thread_num(int thread_num, diff --git a/mobile/src/framework/executor.cpp b/mobile/src/framework/executor.cpp index d03cefe59a221093d4e5fb4e86273b3007097d9f..cda5c5522c961c70fc15bf76fcd650a17bb76835 100644 --- a/mobile/src/framework/executor.cpp +++ b/mobile/src/framework/executor.cpp @@ -80,7 +80,7 @@ Executor::Executor(const Program &program, std::vector> ops = block_desc->Ops(); for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op_desc = ops[j]; - DLOG << "create op: " << op_desc->Type(); + LOG(kLOG_INFO) << "create op[" << j << "]: " << op_desc->Type(); auto op_handler = OpRegistry::CreateOp( op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(), @@ -111,7 +111,8 @@ Executor::Executor(const Program &program, clock_gettime(CLOCK_MONOTONIC, &ts); profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif - DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type(); + LOG(kLOG_INFO) << "Initialize op[" << count++ + << "]: " << op_handler->Type(); if (op_handler->Type() == "feed" || op_handler->Type() == "fetch") { op_handler->setPrePostType(config_.pre_post_type); } @@ -1015,7 +1016,7 @@ void Executor::InitMemory() { const TensorDesc &desc = var_desc->Tensor_desc(); // DDim ddim = make_ddim(desc.Dims()); DDim ddim = cl_image->dims(); - DLOG << var_desc->Name(); + LOG(kLOG_DEBUG1) << "init image of " << var_desc->Name(); cl_image->InitEmptyImage(context, command_queue, ddim); } } diff --git a/mobile/src/framework/loader.cpp 
b/mobile/src/framework/loader.cpp index 34cf6253cb4571c3b52fe61161cba3e140eb0110..31274743f8b1d4b3d8195526e1ae77129c2729bb 100644 --- a/mobile/src/framework/loader.cpp +++ b/mobile/src/framework/loader.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "framework/loader.h" +#include #include "framework/lod_tensor.h" #include "framework/program/program-optimize/program_optimize.h" @@ -173,7 +174,7 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { rewind(fp); DLOG << "model size: " << size; - + PADDLE_MOBILE_ENFORCE(size > 0, "model size should > 0") *out = reinterpret_cast(malloc(size)); size_t cur_len = 0; diff --git a/mobile/src/framework/operator.cpp b/mobile/src/framework/operator.cpp index 402512c7237be0ca26470361cc16369bd97f7758..a091a49b35203445cda48b2387413193079ecd5e 100644 --- a/mobile/src/framework/operator.cpp +++ b/mobile/src/framework/operator.cpp @@ -62,31 +62,39 @@ void OperatorBase::Run() { DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = this->scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + if (inputs_.count(key) > 0) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto var = this->scope_->FindVar(var_vec_in[i]); + if (var->IsInitialized() && + var->template IsType()) { + const Tensor *tensor = var->template Get(); + if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; #ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_in[i]; + DLOG << var_vec_in[i]; #endif + } } + } else { + DLOG << "did not find key (" << key << ") in inputs_"; } } for (const auto key : 
GetOutKeys()) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + if (outputs_.count(key) > 0) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto var = scope_->FindVar(var_vec_out[i]); + if (var->IsInitialized() && + var->template IsType()) { + const Tensor *tensor = var->template Get(); + if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; #ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_out[i]; + DLOG << var_vec_out[i]; #endif + } } + } else { + DLOG << "did not find key (" << key << ") in outputs_"; } } #endif @@ -100,27 +108,37 @@ void OperatorBase::Run() { DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && var->template IsType()) { - const CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " input- " << key << "=" << *cl_image; + if (inputs_.count(key) > 0) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto var = scope_->FindVar(var_vec_in[i]); + if (var->IsInitialized() && + var->template IsType()) { + const CLImage *cl_image = var->template Get(); + if (cl_image) { + DLOG << type_ << " input- " << key << "=" << *cl_image; + } } } + } else { + DLOG << "did not find key (" << key << ") in inputs_"; } } for (const auto key : GetOutKeys()) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && var->template IsType()) { - const 
CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " output- " << key << "=" << *cl_image; + if (outputs_.count(key) > 0) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto var = scope_->FindVar(var_vec_out[i]); + if (var->IsInitialized() && + var->template IsType()) { + const CLImage *cl_image = var->template Get(); + if (cl_image) { + DLOG << type_ << " output- " << key << "=" << *cl_image; + } } } + } else { + DLOG << "did not find key (" << key << ") in outputs_"; } } #endif diff --git a/mobile/src/io/opencl_interface.cpp b/mobile/src/io/opencl_interface.cpp index 1df5b48339b9b8d82c3e0cc4452c1f7876458ece..636cd1b760801497932606a1cfaae047ed85a994 100644 --- a/mobile/src/io/opencl_interface.cpp +++ b/mobile/src/io/opencl_interface.cpp @@ -28,8 +28,26 @@ cl_command_queue getClCommandQueue() { } bool isInitSuccess() { + prepareOpenclRuntime(); return framework::CLEngine::Instance()->isInitSuccess(); } +bool prepareOpenclRuntime() { +#ifdef PREPARE_OPENCL_RUNTIME + DLOG << "cl runtime prepared. "; + cl_uint numPlatforms; // the NO. 
of platforms + cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); + if (status == CL_SUCCESS) { + if (numPlatforms > 0) { + cl_platform_id *platforms = reinterpret_cast( + malloc(numPlatforms * sizeof(cl_platform_id))); + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + free(platforms); + } + } +#endif + return true; +} + } // namespace paddle_mobile #endif diff --git a/mobile/src/io/opencl_interface.h b/mobile/src/io/opencl_interface.h index f1039f1373df6b65a5fc7f4e01279badfaa40307..6a3608790a98638e207fd20dd6f9f05ea54d9e3d 100644 --- a/mobile/src/io/opencl_interface.h +++ b/mobile/src/io/opencl_interface.h @@ -21,6 +21,7 @@ namespace paddle_mobile { cl_context getContext(); cl_command_queue getClCommandQueue(); bool isInitSuccess(); +bool prepareOpenclRuntime(); } // namespace paddle_mobile diff --git a/mobile/src/io/paddle_mobile.h b/mobile/src/io/paddle_mobile.h index 8b8f0683abd12d9516e2a2cb09078241c2b7944e..8c40b0696ad0f4daf782a71a1816b66a3a2c95df 100644 --- a/mobile/src/io/paddle_mobile.h +++ b/mobile/src/io/paddle_mobile.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "io/paddle_inference_api.h" #ifdef PADDLE_MOBILE_CL #include "framework/cl/cl_engine.h" +#include "io/opencl_interface.h" #endif namespace paddle_mobile { @@ -34,16 +35,24 @@ template class PaddleMobile { public: explicit PaddleMobile(PaddleMobileConfigInternal config) : config_(config) { -#ifndef PADDLE_MOBILE_CL bool is_gpu = std::is_same, Device>::value; +#ifndef PADDLE_MOBILE_CL PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); +#else + if (is_gpu) { + prepareOpenclRuntime(); + } #endif } PaddleMobile() { -#ifndef PADDLE_MOBILE_CL bool is_gpu = std::is_same, Device>::value; +#ifndef PADDLE_MOBILE_CL PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); +#else + if (is_gpu) { // recheck when run cpu in with opencl. 
+ prepareOpenclRuntime(); + } #endif } virtual ~PaddleMobile() { Clear(); } diff --git a/mobile/src/operators/expand_op.cpp b/mobile/src/operators/expand_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/expand_op.h b/mobile/src/operators/expand_op.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/grid_sampler_op.cpp b/mobile/src/operators/grid_sampler_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/grid_sampler_op.h b/mobile/src/operators/grid_sampler_op.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl index 4895c07d201283d2b82e52209baf2baa896bc329..b7f4d16c3bb54b7f28d379e38724c5de8cf9dd06 100644 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl @@ -110,4 +110,22 @@ __kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias, half4 in = read_imageh(input, sampler, coords); half4 output = mad(in, biase, 0); write_imageh(outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords_bias); + half4 output = in * biase; + write_imageh(outputImage, coords, output); } \ No newline at end of file diff --git 
a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/expend.cl b/mobile/src/operators/kernel/cl/cl_kernel/expend.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp index 8d66b50a99a6cd07de8dcf32867f1cb3c28d2232..4261681f3ec2b740516a42785bee30dc843b3a71 100644 --- a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp @@ -43,7 +43,10 @@ bool ConvTransposeKernel::Init( this->cl_helper_.AddKernel("conv_transpose3x3s2", "conv_transpose_kernel.cl"); } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + param->ExecMode() = ConvTransposeParam::EXEC_CONVTRANS_FLOAT; + param->Filter()->InitConv2dTransposeFilterCLImage( + cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_transpose", "conv_transpose_kernel.cl"); } return true; } @@ -58,6 +61,9 @@ void ConvTransposeKernel::Compute( case ConvTransposeParam::EXEC_CONVTRANS3x3s2_FLOAT: ConvTranspose3x3s2AddBnRelu(&this->cl_helper_, param); break; + case ConvTransposeParam::EXEC_CONVTRANS_FLOAT: + ConvTransposeAddBnRelu(&this->cl_helper_, param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION( "Invalid convolution transpose execute mode %d", param.ExecMode()); diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp index fd5b9e6bc3ffcce5ddde03e575cec0d1649758fc..37034a01899d8246abfa5dcf419637e643eff924 100644 --- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp +++ 
b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp @@ -30,16 +30,23 @@ bool ElementwiseMulKernel::Init( if (bias->dims() == param->InputX()->dims()) { DLOG << "init element wise mul"; this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 1) { - DLOG << "init channel_mul"; - this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 2) { - // etc. input 1 72 28 28 - // filter 1 72 - DLOG << "init channel_mul_d2"; - this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); } else { - PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet"); + const int bias_dim_size = bias->dims().size(); + if (bias_dim_size == 1) { + DLOG << "init channel_mul"; + this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); + } else if (bias_dim_size == 2) { + // etc. input 1 72 28 28 + // filter 1 72 + DLOG << "init channel_mul_d2"; + this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); + } else if (bias_dim_size == 4) { + DLOG << "init channel_mul_d4"; + this->cl_helper_.AddKernel("channel_mul_d4", "elementwise_mul_kernel.cl"); + } else { + PADDLE_MOBILE_ENFORCE(false, + "element mul not supported this situation yet"); + } } return true; } @@ -71,68 +78,103 @@ void ElementwiseMulKernel::Compute( clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 1) { - DLOG << "channel mul"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - 
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 2) { - DLOG << "channel mul d2"; + } else { + const int bias_dim_size = bias->dims().size(); + if (bias_dim_size == 1) { + DLOG << "channel mul"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (bias_dim_size == 2) { + DLOG << "channel mul d2"; - // etc. input 1 72 28 28 - // filter 1 72 --> 1 1 1 72 - DLOG << "input->ImageDims(): " << input->ImageDims(); - DLOG << "bias->ImageDims(): " << bias->ImageDims(); - DLOG << "out->ImageDims(): " << output->ImageDims(); + // etc. 
input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); - DLOG << "channel mul d2"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + 
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); - // bias->PrintTensor(*bias); - } else { - PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") + // bias->PrintTensor(*bias); + } else if (bias_dim_size == 4) { + DLOG << "channel_mul_d4"; + // etc. input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); + + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") + } } } diff --git a/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/expand_kernel.cpp b/mobile/src/operators/kernel/cl/expand_kernel.cpp old mode 100755 new mode 100644 diff --git 
a/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp b/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/expand_kernel.h b/mobile/src/operators/kernel/expand_kernel.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/grid_sampler_kernel.h b/mobile/src/operators/kernel/grid_sampler_kernel.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/op_param.h b/mobile/src/operators/op_param.h index e58159fbb74e7a91a88c3e76f8aa713b679d94b8..f588b9fc79e1fe0a69dd00afe6419e0ef1e2aa5b 100644 --- a/mobile/src/operators/op_param.h +++ b/mobile/src/operators/op_param.h @@ -344,10 +344,14 @@ class OpParam { template static const T GetAttr(const string &key, const AttributeMap &map) { + PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", + key.c_str()) return ((Attribute)map.at(key)).Get(); } static const std::string GetStringAttr(const string &key, const AttributeMap &map) { + PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", + key.c_str()) return ((Attribute)map.at(key)).GetString(); } @@ -355,6 +359,10 @@ class OpParam { return map.count(key) > 0; } + static const bool HasVar(const string &key, const VariableNameMap &var_map) { + return var_map.count(key) > 
0; + } + template static T *GetVarValue(const string &key, const VariableNameMap &var_map, const Scope &scope) { @@ -2624,6 +2632,7 @@ class ConvTransposeParam : public OpParam { EXEC_DECONV4X4_FLOAT, EXEC_DEPTHWISETRANS_FLOAT, EXEC_CONVTRANS3x3s2_FLOAT, + EXEC_CONVTRANS_FLOAT, }; ExecMode &ExecMode() const { return exec_mode_; } @@ -3100,16 +3109,37 @@ class NearestInterpolationParam : public OpParam { const AttributeMap &attrs, Scope *scope) : OpParam(inputs, outputs, attrs, scope) { input_x_ = InputXFrom(inputs, *scope); - input_outsize_ = InputOutSizeFrom(inputs, *scope); + const bool has_out_size = HasVar("OutSize", inputs); + + if (has_out_size) { + input_outsize_ = InputOutSizeFrom(inputs, *scope); + } + out_ = OutFrom(outputs, *scope); - out_h_ = GetAttr("out_h", attrs); - out_w_ = GetAttr("out_w", attrs); + + if (HasAttr("out_h", attrs)) { + out_h_ = GetAttr("out_h", attrs); + } else if (HasAttr("out_h ", attrs)) { + // some models hurts .... attr with space .. + out_h_ = GetAttr("out_h ", attrs); + } + + if (HasAttr("out_w", attrs)) { + out_w_ = GetAttr("out_w", attrs); + } else if (HasAttr("out_w ", attrs)) { + // some models hurts .... attr with space .. 
+ out_w_ = GetAttr("out_w ", attrs); + } + + LOG(kLOG_DEBUG1) << "out_h_: " << out_h_; + LOG(kLOG_DEBUG1) << "out_w_: " << out_w_; + if (HasAttr("scale", attrs)) { has_scale_ = true; scale_ = GetAttr("scale", attrs); } - DLOG << "has_scale_: " << has_scale_; - DLOG << "scale_: " << scale_; + LOG(kLOG_DEBUG1) << "has_scale_: " << has_scale_; + LOG(kLOG_DEBUG1) << "scale_: " << scale_; } const GType *InputX() const { return input_x_; } const GType *InputOutPutSize() const { return input_outsize_; } diff --git a/mobile/src/pass/memory_optimize_cl.cpp b/mobile/src/pass/memory_optimize_cl.cpp index 355123349d645075fd2ccc37144144da7d332a8f..53bb675f17b2bae9c3954fa57894b8f73fc611fe 100644 --- a/mobile/src/pass/memory_optimize_cl.cpp +++ b/mobile/src/pass/memory_optimize_cl.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #ifdef PADDLE_MOBILE_CL #include "pass/memory_optimize_cl.h" #include +#include #include "framework/cl/cl_image.h" #include "framework/lod_tensor.h" namespace paddle_mobile { @@ -79,7 +80,7 @@ void MemoryOptPassCl::operator()( std::vector fetch_var_nodes; for (const auto &op : block->Ops()) { - DLOG << "op_desc->Type(): " << op->Type(); + LOG(kNO_LOG) << "op_desc->Type(): " << op->Type(); for (const auto &outputs : op->GetOutputs()) { for (const auto &output : outputs.second) { // not a persistable and not a exclude one ,then add it to @@ -87,7 +88,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(output) && std::find(exclude_var_names.begin(), exclude_var_names.end(), output) == exclude_var_names.end()) { - DLOG << "output: " << output; + LOG(kNO_LOG) << "output: " << output; ClVarNode *node = CreateNode(output); analysis_nodes_.push(node); } @@ -100,7 +101,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(input) && std::find(exclude_var_names.begin(), exclude_var_names.end(), input) == exclude_var_names.end()) { - DLOG << "input: " << input; + LOG(kNO_LOG) << "input: " << input; ClVarNode *node = CreateNode(input); 
analysis_nodes_.push(node); if (op->Type() == "fetch") { @@ -114,7 +115,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(output) && std::find(exclude_var_names.begin(), exclude_var_names.end(), output) == exclude_var_names.end()) { - DLOG << "output: " << output; + LOG(kNO_LOG) << "output: " << output; ClVarNode *node = CreateNode(output); analysis_nodes_.push(node); } @@ -164,8 +165,8 @@ void MemoryOptPassCl::ShareData( cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue(); for (const auto &list : reused_nodes_) { - DLOG << "\n"; - DLOG << "gpu . share memory within these variables"; + LOG(kNO_LOG) << "\n"; + LOG(kNO_LOG) << "gpu . share memory within these variables"; int64_t x_based_max_numl = -1; int64_t y_based_max_numl = -1; int64_t x_based_max_x = -1; diff --git a/mobile/test/CMakeLists.txt b/mobile/test/CMakeLists.txt index 76ddd78f1af8989eed1813380e7e1c642f0b394e..078440f45b0525ce49140ad78b2f9c23bb0f55f1 100644 --- a/mobile/test/CMakeLists.txt +++ b/mobile/test/CMakeLists.txt @@ -551,6 +551,12 @@ if (ENABLE_ALL_TEST) ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-inference-api-v2 paddle-mobile) + + if (GPU_CL) + ADD_EXECUTABLE(test-net-male2fe net/test_mobilenet_male2fe.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-net-male2fe paddle-mobile) + endif() + endif () else () # gen test diff --git a/mobile/test/executor_for_test_opencl.h b/mobile/test/executor_for_test_opencl.h old mode 100755 new mode 100644 diff --git a/mobile/test/net/test_inference_api_v2.cpp b/mobile/test/net/test_inference_api_v2.cpp old mode 100755 new mode 100644 diff --git a/mobile/test/net/test_mobilenet_male2fe.cpp b/mobile/test/net/test_mobilenet_male2fe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eb83b5bafe73a52c88a2408715eb4ffd2dff4676 --- /dev/null +++ b/mobile/test/net/test_mobilenet_male2fe.cpp 
@@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../../src/common/types.h" +#include "../test_helper.h" +#include "../test_include.h" + +void feed(PaddleMobile *paddle_mobile, const DDim &dims, + std::string image_path, std::string feed_name) { + float *input_data_array = new float[product(dims)]; + std::ifstream in(image_path, std::ios::in); + for (int i = 0; i < product(dims); i++) { + float num; + in >> num; + input_data_array[i] = num; + } + in.close(); + framework::Tensor input_tensor(input_data_array, dims); + DLOG << feed_name << " : " << input_tensor; + paddle_mobile->Feed(feed_name, input_tensor); +} + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + auto time1 = paddle_mobile::time(); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + + if (paddle_mobile.Load(std::string("../models/nanbiannv") + "/model", + std::string("../models/nanbiannv") + "/params", + true)) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + feed(&paddle_mobile, {1, 3, 256, 256}, "../images/input_1_3_256_256", + "image"); + + auto time3 = paddle_mobile::time(); + paddle_mobile.Predict(); + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) + << "ms" << std::endl; + } + + auto rgb = 
paddle_mobile.Fetch("rgb"); + auto mask = paddle_mobile.Fetch("mask"); + LOG(kLOG_INFO) << "rgb" << *rgb; + LOG(kLOG_INFO) << "mask" << *mask; + return 0; +} diff --git a/mobile/test/net/test_net_multi_feed.cpp b/mobile/test/net/test_net_multi_feed.cpp old mode 100755 new mode 100644 diff --git a/mobile/test/operators/test_expend_op.cpp b/mobile/test/operators/test_expend_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/tools/python/fluidtools/run_multi_feed.py b/mobile/tools/python/fluidtools/run_multi_feed.py old mode 100755 new mode 100644